commit 38d813617c82722a00822b33ce62556f0e9a06df Author: hailin Date: Sun Aug 3 20:28:19 2025 +0800 first commit diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..868b817 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,527 @@ + +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. + +# Please update any changes made here to +# docs/contributing/dockerfile/dockerfile.md and +# docs/assets/contributing/dockerfile-stages-dependency.png + +ARG CUDA_VERSION=12.8.1 +ARG PYTHON_VERSION=3.12 + +# By parameterizing the base images, we allow third-party to use their own +# base images. One use case is hermetic builds with base images stored in +# private registries that use a different repository naming conventions. +# +# Example: +# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +# By parameterizing the Deadsnakes repository URL, we allow third-party to use +# their own mirror. When doing so, we don't benefit from the transparent +# installation of the GPG key of the PPA, as done by add-apt-repository, so we +# also need a URL for the GPG key. +ARG DEADSNAKES_MIRROR_URL +ARG DEADSNAKES_GPGKEY_URL + +# The PyPA get-pip.py script is a self contained script+zip file, that provides +# both the installer script and the pip base85-encoded zip archive. This allows +# bootstrapping pip in environment where a dsitribution package does not exist. +# +# By parameterizing the URL for get-pip.py installation script, we allow +# third-party to use their own copy of the script stored in a private mirror. +# We set the default value to the PyPA owned get-pip.py script. +# +# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py +ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" + +# PIP supports fetching the packages from custom indexes, allowing third-party +# to host the packages in private mirrors. The PIP_INDEX_URL and +# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the +# default indexes. By letting them empty by default, PIP will use its default +# indexes if the build process doesn't override the indexes. +# +# Uv uses different variables. We set them by default to the same values as +# PIP, but they can be overridden. +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL=${PIP_INDEX_URL} +ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} + +# PyTorch provides its own indexes for standard and nightly builds +ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl +ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly + +# PIP supports multiple authentication schemes, including keyring +# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to +# disabled by default, we allow third-party to use keyring authentication for +# their private Python indexes, while not changing the default behavior which +# is no authentication. 
+# +# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support +ARG PIP_KEYRING_PROVIDER=disabled +ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER} + +# Flag enables built-in KV-connector dependency libs into docker images +ARG INSTALL_KV_CONNECTORS=false + +#################### BASE BUILD IMAGE #################### +# prepare basic build environment +FROM ${BUILD_BASE_IMAGE} AS base +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG TARGETPLATFORM +ARG INSTALL_KV_CONNECTORS=false +ENV DEBIAN_FRONTEND=noninteractive + +ARG DEADSNAKES_MIRROR_URL +ARG DEADSNAKES_GPGKEY_URL +ARG GET_PIP_URL + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo \ + && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \ + if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \ + mkdir -p -m 0755 /etc/apt/keyrings ; \ + curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \ + sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \ + echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \ + fi ; \ + else \ + for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done ; \ + fi \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +ARG PIP_INDEX_URL UV_INDEX_URL +ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL +ARG PYTORCH_CUDA_INDEX_BASE_URL +ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL +ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER + +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN <> /etc/environment + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \ + if [ ! 
-z "${DEADSNAKES_GPGKEY_URL}" ] ; then \ + mkdir -p -m 0755 /etc/apt/keyrings ; \ + curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \ + sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \ + echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \ + fi ; \ + else \ + for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done ; \ + fi \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +ARG PIP_INDEX_URL UV_INDEX_URL +ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL +ARG PYTORCH_CUDA_INDEX_BASE_URL +ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL +ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER + +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# arm64 (GH200) build follows the practice of "use existing pytorch" build, +# we need to install torch and torchvision from the nightly builds first, +# pytorch will not appear as a vLLM dependency in all of the following steps +# after this step +RUN --mount=type=cache,target=/root/.cache/uv \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + uv pip install --system \ + --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ + "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \ + uv pip install --system \ + --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ + --pre pytorch_triton==3.3.0+gitab727c40 ; \ + fi + +# Install vllm wheel first, so that torch etc will be installed. +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system dist/*.whl --verbose \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') + +# If we need to build FlashInfer wheel before its release: +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout v0.2.6.post1 +# $ python -m flashinfer.aot +# $ python -m build --no-isolation --wheel +# $ ls -la dist +# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl +# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl + +# Install FlashInfer from source +ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" +ARG FLASHINFER_GIT_REF="v0.2.8rc1" +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + . /etc/environment + git clone --depth 1 --recursive --shallow-submodules \ + --branch ${FLASHINFER_GIT_REF} \ + ${FLASHINFER_GIT_REPO} flashinfer + # Exclude CUDA arches for older versions (11.x and 12.0-12.7) + # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. + if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" + else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + fi + echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + # Needed to build AOT kernels + pushd flashinfer + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer.aot + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + uv pip install --system --no-build-isolation . + popd + rm -rf flashinfer +BASH +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . + +RUN --mount=type=cache,target=/root/.cache/uv \ +. /etc/environment && \ +uv pip list + +# Even when we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. Therefore we need to +# install build dependencies for JIT compilation. +# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements/build.txt requirements/build.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + +#################### vLLM installation IMAGE #################### + +#################### TEST IMAGE #################### +# image to run unit testing suite +# note that this uses vllm installed by `pip` +FROM vllm-base AS test + +ADD . 
/vllm-workspace/ + +ARG PYTHON_VERSION + +ARG PIP_INDEX_URL UV_INDEX_URL +ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + if [ "$CUDA_MAJOR" -ge 12 ]; then \ + uv pip install --system -r requirements/dev.txt; \ + fi + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils + +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# Copy in the v1 package for testing (it isn't distributed yet) +COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 + +# doc requires source code +# we hide them inside `test_docs/` , so that this source code +# will not be imported by other tests +RUN mkdir test_docs +RUN mv docs test_docs/ +RUN cp -r examples test_docs/ +RUN mv vllm test_docs/ +RUN mv mkdocs.yaml test_docs/ +#################### TEST IMAGE #################### + +#################### OPENAI API SERVER #################### +# base openai image with additional requirements, for any subsequent openai-style images +FROM vllm-base AS vllm-openai-base +ARG TARGETPLATFORM +ARG INSTALL_KV_CONNECTORS=false + +ARG PIP_INDEX_URL UV_INDEX_URL +ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY requirements/kv_connectors.txt requirements/kv_connectors.txt + +# install additional dependencies for openai api server +RUN --mount=type=cache,target=/root/.cache/uv \ + if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ + uv pip install --system -r requirements/kv_connectors.txt; \ + fi; \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + BITSANDBYTES_VERSION="0.42.0"; \ + else \ + BITSANDBYTES_VERSION="0.46.1"; \ + fi; \ + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] + +ENV VLLM_USAGE_SOURCE production-docker-image + +# define sagemaker first, so it is not default from `docker build` +FROM vllm-openai-base AS vllm-sagemaker + +COPY examples/online_serving/sagemaker-entrypoint.sh . 
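+# Make the SageMaker entrypoint script executable and register it as the container entrypoint below.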
+RUN chmod +x sagemaker-entrypoint.sh +ENTRYPOINT ["./sagemaker-entrypoint.sh"] + +FROM vllm-openai-base AS vllm-openai + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +#################### OPENAI API SERVER #################### diff --git a/vllm_v0.10.0/.buildkite/check-wheel-size.py b/vllm_v0.10.0/.buildkite/check-wheel-size.py new file mode 100644 index 0000000..68aff79 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/check-wheel-size.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import sys +import zipfile + +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB +# Note that we have 400 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/3792 . +# Please also sync the value with the one in Dockerfile. +VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) + + +def print_top_10_largest_files(zip_file): + """Print the top 10 largest files in the given zip file.""" + with zipfile.ZipFile(zip_file, "r") as z: + file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] + file_sizes.sort(key=lambda x: x[1], reverse=True) + for f, size in file_sizes[:10]: + print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.") + + +def check_wheel_size(directory): + """Check the size of .whl files in the given directory.""" + for root, _, files in os.walk(directory): + for file_name in files: + if file_name.endswith(".whl"): + wheel_path = os.path.join(root, file_name) + wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) + if wheel_size_mb > VLLM_MAX_SIZE_MB: + print( + f"Not allowed: Wheel {wheel_path} is larger " + f"({wheel_size_mb:.2f} MB) than the limit " + f"({VLLM_MAX_SIZE_MB} MB)." + ) + print_top_10_largest_files(wheel_path) + return 1 + else: + print( + f"Wheel {wheel_path} is within the allowed size " + f"({wheel_size_mb:.2f} MB)." + ) + return 0 + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python check-wheel-size.py ") + sys.exit(1) + + directory = sys.argv[1] + sys.exit(check_wheel_size(directory)) diff --git a/vllm_v0.10.0/.buildkite/generate_index.py b/vllm_v0.10.0/.buildkite/generate_index.py new file mode 100644 index 0000000..7045d88 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/generate_index.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import os + +template = """ + + +

+<html>
+    <body>
+        <h1>Links for vLLM</h1>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+ + +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--wheel", help="The wheel path.", required=True) +args = parser.parse_args() + +filename = os.path.basename(args.wheel) + +with open("index.html", "w") as f: + print(f"Generated index.html for {args.wheel}") + # cloudfront requires escaping the '+' character + f.write( + template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) + ) diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml new file mode 100644 index 0000000..d392a5f --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml @@ -0,0 +1,13 @@ +# For vllm script, with -t option (tensor parallel size). +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 +model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.671 + - name: "exact_match,flexible-extract" + value: 0.664 +limit: 1000 +num_fewshot: 5 +trust_remote_code: True \ No newline at end of file diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml new file mode 100644 index 0000000..4b7776b --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml @@ -0,0 +1,12 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 +model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.905 + - name: "exact_match,flexible-extract" + value: 0.905 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000..05b6617 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,12 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml new file mode 100644 index 0000000..12a87e5 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.752 + - name: "exact_match,flexible-extract" + value: 0.754 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml new file mode 100644 index 0000000..7c7a1ca --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.753 + - name: "exact_match,flexible-extract" + value: 0.753 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml new file mode 100644 index 0000000..1d45c37 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.755 + - name: "exact_match,flexible-extract" + value: 0.755 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000..29a1452 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.753 + - name: "exact_match,flexible-extract" + value: 0.753 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml new file mode 100644 index 0000000..3a5f120 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.764 + - name: "exact_match,flexible-extract" + value: 0.764 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000..5ff57ba --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.728 + - name: "exact_match,flexible-extract" + value: 0.728 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml new file mode 100644 index 0000000..07fb130 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.758 + - name: "exact_match,flexible-extract" + value: 0.759 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000..c278865 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,12 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml new file mode 100644 index 0000000..56ec933 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 +model_name: "HandH1998/QQQ-Llama-3-8b-g128" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.419 + - name: "exact_match,flexible-extract" + value: 0.416 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml new file mode 100644 index 0000000..cca5809 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1 +model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.335 + - name: "exact_match,flexible-extract" + value: 0.323 +limit: 1319 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000..83e11f2 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.356 + - name: "exact_match,flexible-extract" + value: 0.358 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml new file mode 100644 index 0000000..15a836d --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 +model_name: "mgoin/Minitron-4B-Base-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.231 + - name: "exact_match,flexible-extract" + value: 0.22 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml new file mode 100644 index 0000000..5633a2d --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 +model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.86 + - name: "exact_match,flexible-extract" + value: 0.86 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml new file mode 100644 index 0000000..b8024c8 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 +model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.624 + - name: "exact_match,flexible-extract" + value: 0.624 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000..188a112 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,12 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.632 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml new file mode 100644 index 0000000..099e0f4 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 +model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.30 + - name: "exact_match,flexible-extract" + value: 0.465 +limit: 1319 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml new file mode 100644 index 0000000..426e8ff --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.578 + - name: "exact_match,flexible-extract" + value: 0.585 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000..8d57e9d --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.593 + - name: "exact_match,flexible-extract" + value: 0.588 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml new file mode 100644 index 0000000..1bce7e7 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.595 + - name: "exact_match,flexible-extract" + value: 0.582 +limit: 1000 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml new file mode 100644 index 0000000..fc9707d --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 +model_name: "Qwen/Qwen2-57B-A14B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.792 + - name: "exact_match,flexible-extract" + value: 0.824 +limit: 250 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml new file mode 100644 index 0000000..54579a6 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1 +model_name: "Qwen/Qwen2.5-1.5B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.54 + - name: "exact_match,flexible-extract" + value: 0.59 +limit: 1319 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml new file mode 100644 index 0000000..a2f235f --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 +model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.47 + - name: "exact_match,flexible-extract" + value: 0.64 +limit: 1319 +num_fewshot: 5 diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml new file mode 100644 index 0000000..9a9c749 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 +model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.6353 + - name: "exact_match,flexible-extract" + value: 0.637 +limit: null +num_fewshot: null diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/models-large.txt b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000..27a1a9a --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/models-large.txt @@ -0,0 +1,6 @@ +Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml +Qwen2-57B-A14-Instruct.yaml +DeepSeek-V2-Lite-Chat.yaml +Meta-Llama-3-8B-QQQ.yaml diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/models-small.txt b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000..36e0543 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,6 @@ +Qwen2.5-1.5B-Instruct.yaml +Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml +Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml +Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +Qwen1.5-MoE-W4A16-compressed-tensors.yaml diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/conftest.py b/vllm_v0.10.0/.buildkite/lm-eval-harness/conftest.py new file mode 100644 index 0000000..c0d60dd --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/conftest.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from pathlib import Path + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--config-list-file", + action="store", + help="Path to the file listing model config YAMLs (one per line)", + ) + parser.addoption( + "--tp-size", + action="store", + default="1", + help="Tensor parallel size to use for evaluation", + ) + + +@pytest.fixture(scope="session") +def config_list_file(pytestconfig, config_dir): + rel_path = pytestconfig.getoption("--config-list-file") + return config_dir / rel_path + + +@pytest.fixture(scope="session") +def tp_size(pytestconfig): + return pytestconfig.getoption("--tp-size") + + +def pytest_generate_tests(metafunc): + if "config_filename" in metafunc.fixturenames: + rel_path = metafunc.config.getoption("--config-list-file") + config_list_file = Path(rel_path).resolve() + config_dir = config_list_file.parent + with open(config_list_file, encoding="utf-8") as f: + configs = [ + config_dir / line.strip() + for line in f + if line.strip() and not line.startswith("#") + ] + metafunc.parametrize("config_filename", configs) diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/vllm_v0.10.0/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh new file mode 100644 index 0000000..a67fc89 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.4 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." 
+ echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args "pretrained=$MODEL,parallelize=True" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/vllm_v0.10.0/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000..b98d42a --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.4 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/vllm_v0.10.0/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/vllm_v0.10.0/.buildkite/lm-eval-harness/test_lm_eval_correctness.py new file mode 100644 index 0000000..ceea011 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +LM eval harness on model to compare vs HF baseline computed offline. 
+Configs are found in configs/$MODEL.yaml + +pytest -s -v test_lm_eval_correctness.py \ + --config-list-file=configs/models-small.txt \ + --tp-size=1 +""" + +import lm_eval +import numpy as np +import yaml + +RTOL = 0.08 + + +def launch_lm_eval(eval_config, tp_size): + trust_remote_code = eval_config.get("trust_remote_code", False) + max_model_len = eval_config.get("max_model_len", 4096) + model_args = ( + f"pretrained={eval_config['model_name']}," + f"tensor_parallel_size={tp_size}," + f"enforce_eager=true," + f"add_bos_token=true," + f"trust_remote_code={trust_remote_code}," + f"max_model_len={max_model_len}" + ) + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + batch_size="auto", + ) + return results + + +def test_lm_eval_correctness_param(config_filename, tp_size): + eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) + + results = launch_lm_eval(eval_config, tp_size) + + success = True + for task in eval_config["tasks"]: + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][metric["name"]] + print( + f"{task['name']} | {metric['name']}: " + f"ground_truth={ground_truth} | measured={measured_value}" + ) + success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) + + assert success diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/README.md b/vllm_v0.10.0/.buildkite/nightly-benchmarks/README.md new file mode 100644 index 0000000..cdf6a64 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/README.md @@ -0,0 +1,181 @@ +# vLLM benchmark suite + +## Introduction + +This directory contains two sets of benchmark for vllm. + +- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance +- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. + +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. + +## Performance benchmark quick overview + +**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. + +## Nightly benchmark quick overview + +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. + +**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. + +**Benchmarking Duration**: about 3.5hrs. + +## Trigger the benchmark + +Performance benchmark will be triggered when: +- A PR being merged into vllm. +- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. + +Manually Trigger the benchmark + +```bash +bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +``` + +Runtime environment variables: +- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. 
+- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). +- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). +- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). +- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. +- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. + +Nightly benchmark will be triggered when: +- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. + +## Performance benchmark details + +See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. +> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. +### Latency test + +Here is an example of one test inside `latency-tests.json`: + +```json +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, +] +``` + +In this example: + +- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` + +Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. + +WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. + +### Throughput test + +The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. + +The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. + +### Serving test + +We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: + +```json +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, +] +``` + +Inside this example: + +- The `test_name` attribute is also a unique identifier for the test. 
It must start with `serving_`. +- The `server-parameters` includes the command line arguments for vLLM server. +- The `client-parameters` includes the command line arguments for `benchmark_serving.py`. +- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` + +The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly. + +WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. + +### Visualizing the results + +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. +You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. +If you do not see the table, please wait till the benchmark finish running. +The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. + +The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. +When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. +`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. + +Here is an example using the script to compare result_a and result_b without detail test name. +`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name` + +| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | +|----|----------------------------------------|----------------------------------------|----------| +| 0 | 142.633982 | 156.526018 | 1.097396 | +| 1 | 241.620334 | 294.018783 | 1.216863 | +| 2 | 218.298905 | 262.664916 | 1.203235 | +| 3 | 242.743860 | 299.816190 | 1.235113 | + +Here is an example using the script to compare result_a and result_b with detail test name. 
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` +| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | +|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| +| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | +| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 | +| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 | +| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 | +| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 | + +## Nightly test details + +See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. + +### Workflow + +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. +- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. +- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. + +### Nightly tests + +In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. + +### Docker containers + +The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. + +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. + +WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). 
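+
+### Example: throughput test entry (illustrative)
+
+The "Throughput test" section above notes that `throughput-tests.json` follows the same syntax as `latency-tests.json`, but no example is shown there. The sketch below is illustrative only: the `throughput_` prefix and the exact parameter names are assumptions, chosen to mirror how the latency example maps onto `benchmark_latency.py`, here mapping onto `benchmark_throughput.py` instead.
+
+```json
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "backend": "vllm",
+            "num_prompts": 200
+        }
+    }
+]
+```
+
+As with the other test files, `run-performance-benchmarks.sh` converts the underscores to dashes before passing these as command line arguments, so an entry like this would expand to something like `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --backend vllm --num-prompts 200`.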
diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/vllm_v0.10.0/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml new file mode 100644 index 0000000..4259514 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -0,0 +1,184 @@ +steps: + - label: "Wait for container to be ready" + key: wait-for-container-image + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: badouralix/curl-jq + command: + - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - label: "Cleanup H100" + agents: + queue: H100 + depends_on: ~ + command: docker system prune -a --volumes --force + + - label: "A100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: A100 + depends_on: wait-for-container-image + if: build.branch == "main" + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + + - label: "H200" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H200 + depends_on: wait-for-container-image + if: build.branch == "main" + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: 4,5,6,7 + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN + + #- block: "Run H100 Benchmark" + #key: block-h100 + #depends_on: ~ + + - label: "H100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H100 + depends_on: wait-for-container-image + if: build.branch == "main" + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN + + # Premerge benchmark + - label: "A100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: A100 + depends_on: wait-for-container-image + if: build.branch != "main" + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + 
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + + - label: "H200" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H200 + depends_on: wait-for-container-image + if: build.branch != "main" + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: 4,5,6,7 + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN + + #- block: "Run H100 Benchmark" + #key: block-h100 + #depends_on: ~ + + - label: "H100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H100 + depends_on: wait-for-container-image + if: build.branch != "main" + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN
diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-annotation.md b/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-annotation.md new file mode 100644 index 0000000..ef11c04 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -0,0 +1,27 @@
+
+## Description
+
+This file contains the download links for the benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+## Results reproduction
+
+- Find the docker image we use in the `benchmarking pipeline`
+- Deploy the docker image, and inside the container:
+  - Download `nightly-benchmarks.zip`.
+  - In the same folder, run the following code:
+
+  ```bash
+  export HF_TOKEN=
+  apt update
+  apt install -y git
+  unzip nightly-benchmarks.zip
+  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  ```
+
+The results will be inside `./benchmarks/results`.
diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-descriptions.md b/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-descriptions.md new file mode 100644 index 0000000..5f003f4 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -0,0 +1,39 @@
+
+# Nightly benchmark
+
+This benchmark aims to:
+
+- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy or SGLang) leads in performance on which workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
+    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of the datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
+  - Evaluation metrics: Throughput (the higher the better), TTFT (time to the first token, the lower the better), ITL (inter-token latency, the lower the better).
+
+## Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support the `ignore-eos` flag.
diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-pipeline.yaml new file mode 100644 index 0000000..199517e --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -0,0 +1,196 @@ +common_pod_spec: &common_pod_spec + priorityClassName: perf-benchmark + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + +common_container_settings: &common_container_settings + command: + - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + +steps: + - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+ + + + - label: "A100 vllm step 10" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:v0.6.2 + <<: *common_container_settings + + + + - label: "A100 sglang benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: lmsysorg/sglang:v0.3.2-cu121 + <<: *common_container_settings + + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: openmmlab/lmdeploy:v0.6.1-cu12 + <<: *common_container_settings + + + + + - label: "A100 trt llama-8B" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama8B" + + + - label: "A100 trt llama-70B" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama70B" + + + # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image + # - label: "A100 trt benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + # <<: *common_container_settings + + + # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. + # - label: "A100 tgi benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 + # <<: *common_container_settings + + - wait + + - label: "Collect the results" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:v0.5.0.post1 + command: + - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + + - block: ":rocket: check the results!" 
\ No newline at end of file
diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/vllm_v0.10.0/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md new file mode 100644 index 0000000..a1f8441 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -0,0 +1,64 @@
+
+## Latency tests
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- CPU Models: llama-3.1 8B.
+- Evaluation metrics: end-to-end latency (mean, median, p99).
+
+{latency_tests_markdown_table}
+
+## Throughput tests
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- CPU Models: llama-3.1 8B.
+- Evaluation metrics: throughput.
+
+{throughput_tests_markdown_table}
+
+## Serving tests
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2.
+- CPU Models: llama-3.1 8B.
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
+
+{serving_tests_markdown_table}
+
+## Platform Information
+
+{platform_markdown_table}
+
+## JSON version of the benchmarking tables
+
+This section contains the data of the markdown tables above in JSON format.
+You can load the benchmarking tables into pandas dataframes as follows:
+
+```python
+import json
+import pandas as pd
+
+benchmarking_results_json = """The json string"""
+benchmarking_results = json.loads(benchmarking_results_json)
+latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
+throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
+serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
+```
+
+The JSON string for all benchmarking tables:
+
+```json
+{benchmarking_results_in_json_string}
+```
+
+You can also check the raw experiment data in the Artifact tab of the Buildkite page.
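Once loaded as above, the dataframes can be sliced like any other pandas table. A small usage sketch continuing the snippet above (the test name here is hypothetical; the column names are the ones used in the serving table):

```python
# Continues the snippet above, which defines `serving_results`.
# Hypothetical lookup: median TTFT of a single serving test case, assuming a
# test with this exact name exists in the results.
row = serving_results[serving_results["Test name"] == "serving_llama8B_tp1_sharegpt_qps_4"]
if not row.empty:
    print(row["Median TTFT (ms)"].iloc[0])
```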
diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/compare-json-results.py new file mode 100644 index 0000000..20c1062 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +import pandas as pd + + +def compare_data_columns( + files, name_column, data_column, drop_column, ignore_test_name=False +): + print("\ncompare_data_column: " + data_column) + frames = [] + compare_frames = [] + for file in files: + data_df = pd.read_json(file) + serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) + if ignore_test_name is False: + serving_df = serving_df.rename(columns={name_column: file + "_name"}) + frames.append(serving_df[file + "_name"]) + serving_df = serving_df.rename(columns={data_column: file}) + frames.append(serving_df[file]) + compare_frames.append(serving_df[file]) + if len(compare_frames) >= 2: + # Compare numbers among two files + ratio_df = compare_frames[1] / compare_frames[0] + frames.append(ratio_df) + compare_frames.pop(1) + + concat_df = pd.concat(frames, axis=1) + return concat_df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-f", "--file", action="append", type=str, help="input file name" + ) + parser.add_argument( + "--ignore_test_name", action="store_true", help="ignore_test_name or not" + ) + args = parser.parse_args() + files = args.file + print("comparing : " + ", ".join(files)) + + drop_column = "P99" + name_column = "Test name" + data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "Median TTFT /n", + "Median TPOT /n", + ] + ignore_test_name = args.ignore_test_name + with open("perf_comparison.html", "w") as text_file: + for i in range(len(data_cols_to_compare)): + output_df = compare_data_columns( + files, + name_column, + data_cols_to_compare[i], + drop_column, + ignore_test_name=ignore_test_name, + ) + print(output_df) + html = output_df.to_html() + text_file.write(html_msgs_for_data_cols[i]) + text_file.write(html) diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py new file mode 100644 index 0000000..724b530 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -0,0 +1,268 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +import os +from importlib import util +from pathlib import Path + +import pandas as pd +import psutil +from tabulate import tabulate + +results_folder = Path("results/") + +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "avg_latency": "Mean latency (ms)", + # "P10": "P10 (s)", + # "P25": "P25 (s)", + "P50": "Median latency (ms)", + # "P75": "P75 (s)", + # "P90": "P90 (s)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "num_requests": "# of req.", + "total_num_tokens": "Total # 
of tokens", + "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "completed": "# of req.", + "request_throughput": "Tput (req/s)", + "total_token_throughput": "Total Token Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", + "total_input_tokens": "Total input tokens", + "total_output_tokens": "Total output tokens", + "mean_ttft_ms": "Mean TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", + "mean_tpot_ms": "Mean TPOT (ms)", + "median_tpot_ms": "Median", + "p99_tpot_ms": "P99", + "mean_itl_ms": "Mean ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file) as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps( + { + "latency": latency.to_dict(), + "throughput": throughput.to_dict(), + "serving": serving.to_dict(), + } + ) + + +def get_size_with_unit(bytes, suffix="B"): + """ + Scale bytes to its proper format + e.g: + 1253656 => '1.20MB' + 1253656678 => '1.17GB' + """ + factor = 1024 + for unit in ["", "K", "M", "G", "T", "P"]: + if bytes < factor: + return f"{bytes:.2f}{unit}{suffix}" + bytes /= factor + + +if __name__ == "__main__": + # collect results + for test_file in results_folder.glob("*.json"): + with open(test_file) as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} + ) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = 
pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + svmem = psutil.virtual_memory() + platform_data = { + "Physical cores": [psutil.cpu_count(logical=False)], + "Total cores": [psutil.cpu_count(logical=True)], + "Total Memory": [get_size_with_unit(svmem.total)], + } + + if util.find_spec("numa") is not None: + from numa import info + + platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()] + + if util.find_spec("cpuinfo") is not None: + from cpuinfo import get_cpu_info + + platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]] + + platform_results = pd.DataFrame.from_dict( + platform_data, orient="index", columns=["Platform Info"] + ) + + raw_results_json = results_to_json( + latency_results, throughput_results, serving_results + ) + + # remapping the key, for visualization purpose + if not latency_results.empty: + latency_results = latency_results[list(latency_column_mapping.keys())].rename( + columns=latency_column_mapping + ) + if not serving_results.empty: + serving_results = serving_results[list(serving_column_mapping.keys())].rename( + columns=serving_column_mapping + ) + if not throughput_results.empty: + throughput_results = throughput_results[ + list(throughput_results_column_mapping.keys()) + ].rename(columns=throughput_results_column_mapping) + + processed_results_json = results_to_json( + latency_results, throughput_results, serving_results + ) + + for df in [latency_results, serving_results, throughput_results]: + if df.empty: + continue + + # Sort all dataframes by their respective "Test name" columns + df.sort_values(by="Test name", inplace=True) + + # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", + # we want to turn it into "8xGPUTYPE" + df["GPU"] = df["GPU"].apply( + lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" + ) + + # get markdown tables + latency_md_table = tabulate( + latency_results, headers="keys", tablefmt="pipe", showindex=False + ) + serving_md_table = tabulate( + serving_results, headers="keys", tablefmt="pipe", showindex=False + ) + throughput_md_table = tabulate( + throughput_results, headers="keys", tablefmt="pipe", showindex=False + ) + platform_md_table = tabulate( + platform_results, headers="keys", tablefmt="pipe", showindex=True + ) + + # document the result + with open(results_folder / "benchmark_results.md", "w") as f: + results = read_markdown( + "../.buildkite/nightly-benchmarks/" + + "performance-benchmarks-descriptions.md" + ) + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + platform_markdown_table=platform_md_table, + benchmarking_results_in_json_string=processed_results_json, + ) + f.write(results) + + # document benchmarking results in json + with open(results_folder / "benchmark_results.json", "w") as f: + results = ( + latency_results.to_dict(orient="records") + + throughput_results.to_dict(orient="records") + + serving_results.to_dict(orient="records") + ) + f.write(json.dumps(results)) diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py new file mode 100644 index 0000000..8532ff7 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import 
argparse + +from transformers import AutoTokenizer + + +def main(model, cachedir): + # Load the tokenizer and save it to the specified directory + tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer.save_pretrained(cachedir) + print(f"Tokenizer saved to {cachedir}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download and save Hugging Face tokenizer" + ) + parser.add_argument("--model", type=str, required=True, help="Name of the model") + parser.add_argument( + "--cachedir", type=str, required=True, help="Directory to save the tokenizer" + ) + + args = parser.parse_args() + main(args.model, args.cachedir) diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py new file mode 100644 index 0000000..053fd52 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +from pathlib import Path + +import numpy as np +import pandas as pd +from tabulate import tabulate + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Parse command line arguments for summary-nightly-results script." + ) + parser.add_argument( + "--results-folder", + type=str, + required=True, + help="The folder where the results are stored.", + ) + parser.add_argument( + "--description", type=str, required=True, help="Description of the results." + ) + + args = parser.parse_args() + return args + + +def get_perf(df, method, model, metric): + means = [] + + for qps in [2, 4, 8, 16, "inf"]: + target = df["Test name"].str.contains(model) + target = target & df["Engine"].str.contains(method) + target = target & df["Test name"].str.contains("qps_" + str(qps)) + filtered_df = df[target] + + if filtered_df.empty: + means.append(0.0) + else: + means.append(filtered_df[metric].values[0]) + + return np.array(means) + + +def get_perf_w_std(df, method, model, metric): + if metric in ["TTFT", "ITL"]: + mean = get_perf(df, method, model, "Mean " + metric + " (ms)") + mean = mean.tolist() + std = get_perf(df, method, model, "Std " + metric + " (ms)") + if std.mean() == 0: + std = None + success = get_perf(df, method, model, "Successful req.") + if std is not None: + std = std / np.sqrt(success) + std = std.tolist() + + else: + assert metric == "Tput" + mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( + df, method, model, "Output Tput (tok/s)" + ) + mean = mean.tolist() + std = None + + return mean, std + + +def main(args): + results_folder = Path(args.results_folder) + + results = [] + + # collect results + for test_file in results_folder.glob("*_nightly_results.json"): + with open(test_file) as f: + results = results + json.loads(f.read()) + + # generate markdown table + df = pd.DataFrame.from_dict(results) + + md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) + + with open(args.description) as f: + description = f.read() + + description = description.format(nightly_results_benchmarking_table=md_table) + + with open("nightly_results.md", "w") as f: + f.write(description) + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py new file 
mode 100644 index 0000000..ddea1d2 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from lmdeploy.serve.openai.api_client import APIClient + +api_client = APIClient("http://localhost:8000") +model_name = api_client.available_models[0] + +print(model_name) diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/launch-server.sh new file mode 100644 index 0000000..fb5063d --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -0,0 +1,228 @@ +#!/bin/bash + +# Currently FP8 benchmark is NOT enabled. + +set -x +server_params=$1 +common_params=$2 + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +launch_trt_server() { + + model_path=$(echo "$common_params" | jq -r '.model') + model_name="${model_path#*/}" + model_type=$(echo "$server_params" | jq -r '.model_type') + model_dtype=$(echo "$server_params" | jq -r '.model_dtype') + model_tp_size=$(echo "$common_params" | jq -r '.tp') + max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') + max_input_len=$(echo "$server_params" | jq -r '.max_input_len') + max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') + max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') + trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') + + # create model caching directory + cd ~ + rm -rf models + mkdir -p models + cd models + models_dir=$(pwd) + trt_model_path=${models_dir}/${model_name}-trt-ckpt + trt_engine_path=${models_dir}/${model_name}-trt-engine + + # clone tensorrt backend + cd / + rm -rf tensorrtllm_backend + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git + git lfs install + cd tensorrtllm_backend + git checkout "$trt_llm_version" + git submodule update --init --recursive + + # build trtllm engine + cd /tensorrtllm_backend + cd "./tensorrt_llm/examples/${model_type}" + python3 convert_checkpoint.py \ + --model_dir "${model_path}" \ + --dtype "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --output_dir "${trt_model_path}" + trtllm-build \ + --checkpoint_dir "${trt_model_path}" \ + --use_fused_mlp \ + --reduce_fusion disable \ + --workers 8 \ + --gpt_attention_plugin "${model_dtype}" \ + --gemm_plugin "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --max_batch_size "${max_batch_size}" \ + --max_input_len "${max_input_len}" \ + --max_seq_len "${max_seq_len}" \ + --max_num_tokens "${max_num_tokens}" \ + --output_dir "${trt_engine_path}" + + # handle triton protobuf files and launch triton server + cd /tensorrtllm_backend + mkdir triton_model_repo + cp -r all_models/inflight_batcher_llm/* triton_model_repo/ + cd triton_model_repo + rm -rf ./tensorrt_llm/1/* + cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 + python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" + cd /tensorrtllm_backend + python3 scripts/launch_triton_server.py \ + --world_size="${model_tp_size}" \ + --model_repo=/tensorrtllm_backend/triton_model_repo & + +} + +launch_tgi_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + --quantize fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" + fi + + echo "Server command: $server_command" + eval "$server_command" & + +} + +launch_lmdeploy_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + server_command="lmdeploy serve api_server $model \ + --tp $tp \ + --server-port $port \ + $server_args" + + # run the server + echo "Server command: $server_command" + bash -c "$server_command" & +} + +launch_sglang_server() { + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +launch_vllm_server() { + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." 
+ model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +main() { + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then + launch_trt_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then + launch_tgi_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then + launch_lmdeploy_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then + launch_sglang_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then + launch_vllm_server + fi +} + +main diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh new file mode 100644 index 0000000..69b6b14 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +set -ex +set -o pipefail + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + (which zip) || (apt-get install -y zip) + + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip plotting the results." + exit 0 + fi + + # initial annotation + #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + + # download results + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + mkdir -p results/ + /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ + ls + ls results/ + + # upload benchmark results + zip -r results.zip results/ + /workspace/buildkite-agent artifact upload "results.zip" + + # upload benchmarking scripts + cd "$VLLM_SOURCE_CODE_LOC/" + zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ + /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" + + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" + # upload benchmarking pipeline + /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" + + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" + /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md + + + + # The figures should be generated by a separate process outside the CI/CD pipeline + + # # generate figures + # python3 -m pip install tabulate pandas matplotlib + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ + # --description $description \ + # --results-folder results/ + + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sharegpt + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_2048_128 + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_128_2048 + + # # upload results and 
figures + # /workspace/buildkite-agent artifact upload "nightly_results*.png" + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json + # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md +} + +main "$@" diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh new file mode 100644 index 0000000..4d01a31 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -0,0 +1,462 @@ +#!/bin/bash + +set -o pipefail +set -x + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + +get_current_llm_serving_engine() { + + if which lmdeploy >/dev/null; then + echo "Container: lmdeploy" + export CURRENT_LLM_SERVING_ENGINE=lmdeploy + return + fi + + if [ -e /tgi-entrypoint.sh ]; then + echo "Container: tgi" + export CURRENT_LLM_SERVING_ENGINE=tgi + return + fi + + if which trtllm-build >/dev/null; then + echo "Container: tensorrt-llm" + export CURRENT_LLM_SERVING_ENGINE=trt + return + fi + + if [ -e /sgl-workspace ]; then + echo "Container: sglang" + export CURRENT_LLM_SERVING_ENGINE=sglang + return + fi + + if [ -e /vllm-workspace ]; then + echo "Container: vllm" + # move to a completely irrelevant directory, to avoid import vllm from current folder + export CURRENT_LLM_SERVING_ENGINE=vllm + + return + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +kill_gpu_processes() { + pkill -f python + pkill -f python3 + pkill -f tritonserver + pkill -f pt_main_thread + pkill -f text-generation + pkill -f lmdeploy + + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done +} + +wait_for_server() { + # wait for 
vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -s localhost:8000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +ensure_installed() { + # Ensure that the given command is installed by apt-get + local cmd=$1 + if ! which "$cmd" >/dev/null; then + apt-get update && apt-get install -y "$cmd" + fi +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # prepare tokenizer + # this is required for lmdeploy. 
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + + + # change model name for lmdeploy (it will not follow standard hf name) + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then + model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ $backend = "trt" ]]; then + backend="tensorrt-llm" + fi + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + + if [[ "$dataset_name" = "sharegpt" ]]; then + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + elif [[ "$dataset_name" = "sonnet" ]]; then + + sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') + sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') + sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --sonnet-input-len $sonnet_input_len \ + --sonnet-output-len $sonnet_output_len \ + --sonnet-prefix-len $sonnet_prefix_len \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + else + + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." + exit 1 + + fi + + + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + server_command="None" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + done + + kill_gpu_processes +} + +run_genai_perf_tests() { + # run genai-perf tests + + # $1: a json file specifying genai-perf test cases + local genai_perf_test_file + genai_perf_test_file=$1 + + # Iterate over genai-perf tests + jq -c '.[]' "$genai_perf_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps=$num_prompts + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + #TODO: add output dir. 
+ client_command="genai-perf profile \ + -m $model \ + --service-kind openai \ + --backend vllm \ + --endpoint-type chat \ + --streaming \ + --url localhost:$port \ + --request-rate $qps \ + --num-prompts $num_prompts \ + " + + echo "Client command: $client_command" + + eval "$client_command" + + #TODO: process/record outputs + done + done + + kill_gpu_processes + +} + +prepare_dataset() { + + # download sharegpt dataset + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + # duplicate sonnet by 4x, to allow benchmarking with input length 2048 + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + +} + +main() { + + # check if the environment variable is successfully injected from yaml + + check_gpus + check_hf_token + get_current_llm_serving_engine + + pip install -U transformers + + pip install -r requirements/dev.txt + which genai-perf + + # check storage + df -h + + ensure_installed wget + ensure_installed curl + ensure_installed jq + # genai-perf dependency + ensure_installed libb64-0d + + prepare_dataset + + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" + + # run the test + run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + + # run genai-perf tests + run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" + mv artifacts/ $RESULTS_FOLDER/ + + # upload benchmark results to buildkite + python3 -m pip install tabulate pandas + python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" + upload_to_buildkite + +} + +main "$@" diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh new file mode 100644 index 0000000..f050406 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -0,0 +1,474 @@ +#!/bin/bash + +# This script should be run inside the CI process +# This script assumes that we are already inside the vllm/ directory +# Benchmarking results will be available inside vllm/benchmarks/results/ + +# Do not set -e, as the mixtral 8x22B model tends to crash occasionally +# and we still want to see other benchmarking results even when mixtral crashes. +set -x +set -o pipefail + +check_gpus() { + if command -v nvidia-smi; then + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + elif command -v amd-smi; then + declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + fi + + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + if command -v nvidia-smi; then + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif command -v amd-smi; then + declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') + fi + echo "GPU type is $gpu_type" +} + +check_cpus() { + # check the number of CPUs and NUMA Node and GPU type. + declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") + if [[ $numa_count -gt 0 ]]; then + echo "NUMA found." + echo $numa_count + else + echo "Need at least 1 NUMA to run benchmarking." 
+ exit 1 + fi + declare -g gpu_type="cpu" + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +ensure_sharegpt_downloaded() { + local FILE=ShareGPT_V3_unfiltered_cleaned_split.json + if [ ! -f "$FILE" ]; then + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE + else + echo "$FILE already exists." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +json2envs() { + # transforms the JSON string to environment variables. + # example: + # input: { "VLLM_CPU_KVCACHE_SPACE": 5 } + # output: VLLM_CPU_KVCACHE_SPACE=5 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map((.key ) + "=" + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -X POST localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +kill_processes_launched_by_current_bash() { + # Kill all python processes launched from current bash script + current_shell_pid=$$ + processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}') + if [ -n "$processes" ]; then + echo "Killing the following processes matching '$1':" + echo "$processes" + echo "$processes" | xargs kill -9 + else + echo "No processes found matching '$1'." + fi +} + +kill_gpu_processes() { + + ps -aux + lsof -t -i:8000 | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + + + # wait until GPU memory usage smaller than 1GB + if command -v nvidia-smi; then + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done + elif command -v amd-smi; then + while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do + sleep 1 + done + fi + + # remove vllm config file + rm -rf ~/.config/vllm + +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent + if command -v buildkite-agent >/dev/null 2>&1; then + BUILDKITE_AGENT_COMMAND="buildkite-agent" + elif [ -f /workspace/buildkite-agent ]; then + BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" + else + echo "buildkite-agent binary not found. Skip uploading the results." 
+    return 0
+  fi
+
+  # Use the determined command to annotate and upload artifacts
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
+}
+
+run_latency_tests() {
+  # run latency tests using `benchmark_latency.py`
+  # $1: a json file specifying latency test cases
+
+  local latency_test_file
+  latency_test_file=$1
+
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In $latency_test_file, test_name must start with \"latency_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    latency_envs=$(json2envs "$latency_environment_variables")
+
+    # check if there are enough GPUs to run the test
+    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    if [ "$ON_CPU" == "1" ];then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        continue
+      fi
+    else
+      if [[ $gpu_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $gpu_count GPUs found. Skip testcase $test_name."
+        continue
+      fi
+    fi
+
+    latency_command=" $latency_envs python3 benchmark_latency.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $latency_args"
+
+    echo "Running test case $test_name"
+    echo "Latency command: $latency_command"
+
+    # record the benchmarking command and GPU type
+    jq_output=$(jq -n \
+      --arg latency "$latency_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        latency_command: $latency,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$latency_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_throughput_tests() {
+  # run throughput tests using `benchmark_throughput.py`
+  # $1: a json file specifying throughput test cases
+
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In $throughput_test_file, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    throughput_envs=$(json2envs "$throughput_environment_variables")
+
+    # check if there are enough GPUs to run the test
+    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+    if [ "$ON_CPU" == "1" ];then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        continue
+      fi
+    else
+      if [[ $gpu_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $gpu_count GPUs found. Skip testcase $test_name."
+        continue
+      fi
+    fi
+
+    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+    # record the benchmarking command and GPU type
+    jq_output=$(jq -n \
+      --arg command "$throughput_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        throughput_command: $command,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In $serving_test_file, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    server_envs=$(json2envs "$server_envs")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there are enough resources to run the test
+    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    if [ "$ON_CPU" == "1" ];then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        continue
+      fi
+    else
+      if [[ $gpu_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $gpu_count GPUs found. Skip testcase $test_name."
+        continue
+      fi
+    fi
+
+    # check if the server model and client model are aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $test_name."
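+      # skip (rather than abort) so the remaining serving test cases still run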
+ continue + fi + + server_command="$server_envs python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + # support remote vllm server + client_remote_args="" + if [[ -z "${REMOTE_HOST}" ]]; then + bash -c "$server_command" & + server_pid=$! + # wait until the server is alive + if wait_for_server; then + echo "" + echo "vLLM server is up and running." + else + echo "" + echo "vLLM failed to start within the timeout period." + fi + else + server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" + if [[ ${REMOTE_PORT} ]]; then + client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT " + else + client_remote_args=" --host=$REMOTE_HOST " + fi + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard + client_command="python3 benchmark_serving.py \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --metadata "tensor_parallel_size=$tp" \ + $client_args $client_remote_args " + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill -9 $server_pid + kill_gpu_processes + done +} + +main() { + local ARCH + ARCH='' + if [ "$ON_CPU" == "1" ];then + check_cpus + ARCH='-cpu' + else + check_gpus + fi + check_hf_token + + # Set to v1 to run v1 benchmark + if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then + export VLLM_USE_V1=1 + fi + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + (which lsof) || (apt-get update && apt-get install -y lsof) + + # get the current IP address, required by benchmark_serving.py + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + # turn of the reporting of the status of each request, to clean up the terminal output + export VLLM_LOGGING_LEVEL="WARNING" + + # prepare for benchmarking + cd benchmarks || exit 1 + ensure_sharegpt_downloaded + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + # benchmarking + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" + + # postprocess benchmarking results + pip install tabulate pandas + python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + + upload_to_buildkite +} + +main "$@" diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py new file mode 100644 index 0000000..fb3b9d5 --- /dev/null +++ 
b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import datetime +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "completed": "Successful req.", + "request_throughput": "Tput (req/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "std_ttft_ms": "Std TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "mean_itl_ms": "Mean ITL (ms)", + "std_itl_ms": "Std ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "mean_tpot_ms": "Mean TPOT (ms)", + "std_tpot_ms": "Std TPOT (ms)", + "median_tpot_ms": "Median TPOT (ms)", + "total_token_throughput": "Total Token Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", + "total_input_tokens": "Total input tokens", + "total_output_tokens": "Total output tokens", + "engine": "Engine", +} + +if __name__ == "__main__": + # collect results + for test_file in results_folder.glob("*.json"): + with open(test_file) as f: + raw_result = json.loads(f.read()) + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + serving_results = pd.DataFrame.from_dict(serving_results) + + if not serving_results.empty: + serving_results = serving_results[list(serving_column_mapping.keys())].rename( + columns=serving_column_mapping + ) + + serving_md_table_with_headers = tabulate( + serving_results, headers="keys", tablefmt="pipe", showindex=False + ) + # remove the first line of header + serving_md_table_lines = serving_md_table_with_headers.split("\n") + serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) + + prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") + + # document benchmarking results in markdown + with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: + # document results with header. + # for those who wants to reproduce our benchmark. 
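+        # Illustrative only: with tablefmt="pipe" the written table starts roughly like
+        #   | Test name | GPU | Successful req. | Tput (req/s) | ...
+        # with one row per *.json result collected above.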
+ f.write(serving_md_table_with_headers) + f.write("\n") + + # document benchmarking results in json + with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: + results = serving_results.to_dict(orient="records") + f.write(json.dumps(results)) diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh new file mode 100644 index 0000000..50e1ab0 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -0,0 +1,23 @@ +#!/bin/sh +TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) +if [[ "$BUILDKITE_BRANCH" == "main" ]]; then + URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" +else + URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" +fi + +TIMEOUT_SECONDS=10 + +retries=0 +while [ $retries -lt 1000 ]; do + if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then + exit 0 + fi + + echo "Waiting for image to be available..." + + retries=$((retries + 1)) + sleep 5 +done + +exit 1 diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json new file mode 100644 index 0000000..edbe9f2 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] \ No newline at end of file diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json new file mode 100644 index 0000000..da93fdd --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -0,0 +1,30 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/latency-tests.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/latency-tests.json new file mode 100644 index 0000000..7762a23 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + 
"num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + } +] diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/nightly-tests.json new file mode 100644 index 0000000..fda1a7a --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -0,0 +1,323 @@ +[ + { + "test_name": "llama8B_tp1_sharegpt", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000, + "reuse_server": false + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + 
"sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sharegpt", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000, + "reuse_server": false + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + 
"common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + } +] \ No newline at end of file diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json new file mode 100644 index 0000000..22f71c9 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -0,0 +1,158 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + 
"tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 100, + "num_prompts": 100 + } + }, + { + "test_name": "serving_llama8B_pp6_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 6, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 100, + "num_prompts": 100 + } + } +] diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/serving-tests.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/serving-tests.json new file mode 100644 index 0000000..13fd5aa --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -0,0 +1,81 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": 
"./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt_specdecode", + "qps_list": [2], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "disable_log_requests": "", + "tensor_parallel_size": 4, + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json new file mode 100644 index 0000000..f159c30 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/throughput-tests.json new file mode 100644 index 0000000..9bc87cb --- /dev/null +++ b/vllm_v0.10.0/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -0,0 +1,35 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/vllm_v0.10.0/.buildkite/pyproject.toml 
b/vllm_v0.10.0/.buildkite/pyproject.toml new file mode 100644 index 0000000..d5cad1c --- /dev/null +++ b/vllm_v0.10.0/.buildkite/pyproject.toml @@ -0,0 +1,46 @@ +# This local pyproject file is part of the migration from yapf to ruff format. +# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.format] +docstring-code-format = true diff --git a/vllm_v0.10.0/.buildkite/release-pipeline.yaml b/vllm_v0.10.0/.buildkite/release-pipeline.yaml new file mode 100644 index 0000000..6314afd --- /dev/null +++ b/vllm_v0.10.0/.buildkite/release-pipeline.yaml @@ -0,0 +1,124 @@ +steps: + - label: "Build wheel - CUDA 12.8" + id: build-wheel-cuda-12-8 + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + - label: "Build wheel - CUDA 12.6" + id: build-wheel-cuda-12-6 + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. + # However, this block can be uncommented to save some compute hours. + # - block: "Build CUDA 11.8 wheel" + # key: block-build-cu118-wheel + + - label: "Build wheel - CUDA 11.8" + # depends_on: block-build-cu118-wheel + id: build-wheel-cuda-11-8 + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
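+      # Same build-and-upload flow as the CUDA 12.8 / 12.6 wheels above; it differs only in CUDA_VERSION and torch_cuda_arch_list.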
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + - block: "Build release image" + depends_on: ~ + key: block-release-image-build + + - label: "Build release image" + depends_on: block-release-image-build + id: build-release-image + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + + - label: "Annotate release workflow" + depends_on: + - build-release-image + - build-wheel-cuda-12-8 + - build-wheel-cuda-12-6 + - build-wheel-cuda-11-8 + id: annotate-release-workflow + agents: + queue: cpu_queue_postmerge + commands: + - "bash .buildkite/scripts/annotate-release.sh" + + - label: "Build and publish TPU release image" + depends_on: ~ + if: build.env("NIGHTLY") == "1" + agents: + queue: tpu_queue_postmerge + commands: + - "yes | docker system prune -a" + - "git fetch --all" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." + - "docker push vllm/vllm-tpu:nightly" + - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + + - input: "Provide Release version here" + id: input-release-version + fields: + - text: "What is the release version?" + key: release-version + + - block: "Build CPU release image" + key: block-cpu-release-image-build + depends_on: ~ + + - label: "Build and publish CPU release image" + depends_on: block-cpu-release-image-build + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" + - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" + + - block: "Build Neuron release image" + key: block-neuron-release-image-build + depends_on: ~ + + - label: "Build and publish Neuron release image" + depends_on: block-neuron-release-image-build + agents: + queue: neuron-postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" + - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" diff --git a/vllm_v0.10.0/.buildkite/scripts/annotate-release.sh b/vllm_v0.10.0/.buildkite/scripts/annotate-release.sh new file mode 100755 index 0000000..94e0ac2 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/annotate-release.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -ex + +# Get release version and strip leading 'v' if present +RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') + +if [ -z "$RELEASE_VERSION" ]; then + echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." + exit 1 +fi + +buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF +To download the wheel: +\`\`\` +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . +\`\`\` + +To download and upload the image: + +\`\`\` +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai +docker tag vllm/vllm-openai vllm/vllm-openai:latest +docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} +docker push vllm/vllm-openai:latest +docker push vllm/vllm-openai:v${RELEASE_VERSION} +\`\`\` +EOF \ No newline at end of file diff --git a/vllm_v0.10.0/.buildkite/scripts/ci-clean-log.sh b/vllm_v0.10.0/.buildkite/scripts/ci-clean-log.sh new file mode 100644 index 0000000..69d8a3a --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/ci-clean-log.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Usage: ./ci_clean_log.sh ci.log +# This script strips timestamps and color codes from CI log files. 
+ +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 ci.log" + exit 1 +fi + +INPUT_FILE="$1" + +# Strip timestamps +sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" + +# Strip colorization +sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-amd-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-amd-test.sh new file mode 100755 index 0000000..5e5a532 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -0,0 +1,243 @@ +#!/bin/bash + +# This script runs test inside the corresponding ROCm docker container. +set -o pipefail + +# Export Python path +export PYTHONPATH=".." + +# Print ROCm version +echo "--- Confirming Clean Initial State" +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + +echo "--- ROCm info" +rocminfo + +# cleanup older docker images +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." 
+ fi +} + +# Call the cleanup docker function +cleanup_docker + +echo "--- Resetting GPUs" + +echo "reset" > /opt/amdgpu/etc/gpu_state + +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + +echo "--- Pulling container" +image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" +docker pull "${image_name}" + +remove_docker_container() { + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true +} +trap remove_docker_container EXIT + +echo "--- Running container" + +HF_CACHE="$(realpath ~)/huggingface" +mkdir -p "${HF_CACHE}" +HF_MOUNT="/root/.cache/huggingface" + +commands=$@ +echo "Commands:$commands" + +if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then + commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} +fi + +if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then + commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} +fi + +if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then + commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} +fi + +if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then + commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} +fi + +if [[ $commands == *"pytest -v -s lora"* ]]; then + commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} +fi + +#ignore certain kernels tests +if [[ $commands == *" kernels/core"* ]]; then + commands="${commands} \ + --ignore=kernels/core/test_fused_quant_layernorm.py \ + --ignore=kernels/core/test_permute_cols.py" +fi + +if [[ $commands == *" kernels/attention"* ]]; then + commands="${commands} \ + --ignore=kernels/attention/test_attention_selector.py \ + --ignore=kernels/attention/test_encoder_decoder_attn.py \ + --ignore=kernels/attention/test_flash_attn.py \ + --ignore=kernels/attention/test_flashinfer.py \ + --ignore=kernels/attention/test_prefix_prefill.py \ + --ignore=kernels/attention/test_cascade_flash_attn.py \ + --ignore=kernels/attention/test_mha_attn.py \ + --ignore=kernels/attention/test_lightning_attn.py \ + --ignore=kernels/attention/test_attention.py" +fi + +if [[ $commands == *" kernels/quantization"* ]]; then + commands="${commands} \ + --ignore=kernels/quantization/test_int8_quant.py \ + --ignore=kernels/quantization/test_aqlm.py \ + --ignore=kernels/quantization/test_machete_mm.py \ + --ignore=kernels/quantization/test_block_fp8.py \ + --ignore=kernels/quantization/test_block_int8.py \ + --ignore=kernels/quantization/test_marlin_gemm.py \ + --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ + --ignore=kernels/quantization/test_int8_kernel.py" +fi + +if [[ $commands == *" kernels/mamba"* ]]; then + 
commands="${commands} \ + --ignore=kernels/mamba/test_mamba_mixer2.py \ + --ignore=kernels/mamba/test_causal_conv1d.py \ + --ignore=kernels/mamba/test_mamba_ssm_ssd.py" +fi + +if [[ $commands == *" kernels/moe"* ]]; then + commands="${commands} \ + --ignore=kernels/moe/test_moe.py \ + --ignore=kernels/moe/test_cutlass_moe.py \ + --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" +fi + +#ignore certain Entrypoints/openai tests +if [[ $commands == *" entrypoints/openai "* ]]; then + commands=${commands//" entrypoints/openai "/" entrypoints/openai \ + --ignore=entrypoints/openai/test_audio.py \ + --ignore=entrypoints/openai/test_shutdown.py \ + --ignore=entrypoints/openai/test_completion.py \ + --ignore=entrypoints/openai/test_sleep.py \ + --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_lora_adapters.py \ + --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ + --ignore=entrypoints/openai/test_root_path.py \ + --ignore=entrypoints/openai/test_tokenization.py \ + --ignore=entrypoints/openai/test_prompt_validation.py "} +fi + +#ignore certain Entrypoints/llm tests +if [[ $commands == *" entrypoints/llm "* ]]; then + commands=${commands//" entrypoints/llm "/" entrypoints/llm \ + --ignore=entrypoints/llm/test_chat.py \ + --ignore=entrypoints/llm/test_accuracy.py \ + --ignore=entrypoints/llm/test_init.py \ + --ignore=entrypoints/llm/test_generate_multiple_loras.py \ + --ignore=entrypoints/llm/test_prompt_validation.py "} +fi + +#Obsolete currently +##ignore certain Entrypoints/llm tests +#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then +# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +#fi + +# --ignore=entrypoints/openai/test_encoder_decoder.py \ +# --ignore=entrypoints/openai/test_embedding.py \ +# --ignore=entrypoints/openai/test_oot_registration.py +# --ignore=entrypoints/openai/test_accuracy.py \ +# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 + + +PARALLEL_JOB_COUNT=8 +MYPYTHONPATH=".." + +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +if [[ $commands == *"--shard-id="* ]]; then + # assign job count as the number of shards used + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do + # assign shard-id for each shard + commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + echo "Shard ${GPU} commands:$commands_gpu" + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" + docker run \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES="${GPU}" \ + -e HF_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + -e "PYTHONPATH=${MYPYTHONPATH}" \ + --name "${container_name}_${GPU}" \ + "${image_name}" \ + /bin/bash -c "${commands_gpu}" \ + |& while read -r line; do echo ">>Shard $GPU: $line"; done & + PIDS+=($!) + done + #wait for all processes to finish and collect exit codes + for pid in "${PIDS[@]}"; do + wait "${pid}" + STATUS+=($?) 
+ done + for st in "${STATUS[@]}"; do + if [[ ${st} -ne 0 ]]; then + echo "One of the processes failed with $st" + exit "${st}" + fi + done +else + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" + docker run \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES=0 \ + -e HF_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + -e "PYTHONPATH=${MYPYTHONPATH}" \ + --name "${container_name}" \ + "${image_name}" \ + /bin/bash -c "${commands}" +fi diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh new file mode 100755 index 0000000..36bcb01 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Setup cleanup +remove_docker_container() { + if [[ -n "$container_id" ]]; then + podman stop --all -t0 + podman rm -f "$container_id" || true + fi + podman system prune -f +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . + +# Run the image +container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc) + +function cpu_tests() { + + # offline inference + podman exec -it "$container_id" bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run basic model test + podman exec -it "$container_id" bash -c " + set -e + pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib + pip install sentence-transformers datamodel_code_generator + pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] + pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] + pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" +} + +# All of CPU tests are expected to be finished less than 40 mins. + +export container_id +export -f cpu_tests +timeout 40m bash -c cpu_tests + diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh new file mode 100755 index 0000000..a97fa50 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Setup cleanup +remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +docker build -t cpu-test -f docker/Dockerfile.s390x . 
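+# Note: despite the header comment above, only the image build is exercised here; no inference is run inside the container.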
diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test.sh new file mode 100644 index 0000000..90cc9c8 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-48-95} +# used for TP/PP E2E test +OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} +NUMA_NODE=${NUMA_NODE:-1} + +export CMAKE_BUILD_PARALLEL_LEVEL=32 + +# Setup cleanup +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . + +# Run the image, setting --shm-size=4g for tensor parallel. +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 + +function cpu_tests() { + set -e + export NUMA_NODE=$2 + + # list packages + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " + set -e + pip list" + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + + # offline inference + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run basic model test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + # Note: disable until supports V1 + # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + + # Note: disable Bart until supports V1 + pytest -v -s tests/models/language/generation -m cpu_model \ + --ignore=tests/models/language/generation/test_bart.py + VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ + --ignore=tests/models/language/generation/test_bart.py + + pytest -v -s tests/models/language/pooling -m cpu_model + pytest -v -s tests/models/multimodal/generation \ + --ignore=tests/models/multimodal/generation/test_mllama.py \ + --ignore=tests/models/multimodal/generation/test_pixtral.py \ + -m cpu_model" + + # Run compressed-tensor test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + + # Note: disable it until supports V1 + # Run AWQ test + # docker exec cpu-test-"$NUMA_NODE" bash -c " + # set -e + # VLLM_USE_V1=0 pytest -s -v \ + # 
tests/quantization/test_ipex_quant.py" + + # online serving + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions' + + # Run multi-lora tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/lora/test_qwen2vl.py" +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-gh200-test.sh new file mode 100644 index 0000000..8c64e14 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# This script build the GH200 docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile +python3 use_existing_torch.py + +# Try building the docker image +DOCKER_BUILDKIT=1 docker build . \ + --file docker/Dockerfile \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t gh200-test \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg RUN_WHEEL_CHECK=false \ + --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" + +# Setup cleanup +remove_docker_container() { docker rm -f gh200-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and test offline inference +docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' + python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B +' diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-hpu-test.sh new file mode 100644 index 0000000..dc9f2d3 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -exuo pipefail + +# Try building the docker image +cat <&2 +fi + +# The trap will handle the container removal and final exit. \ No newline at end of file diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-neuron-test.sh new file mode 100644 index 0000000..a397457 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# This script build the Neuron docker image and run the API server inside the container. +# It serves a sanity check for compilation and basic model usage. 
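+#
+# Assumptions worth noting: the host must expose /dev/neuron0, and the AWS CLI needs access to the
+# "ci/vllm-neuron/hf-token" secret in us-west-2; both are used further down in this script.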
+set -e +set -v + +image_name="neuron/vllm-ci" +container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + +HF_CACHE="$(realpath ~)/huggingface" +mkdir -p "${HF_CACHE}" +HF_MOUNT="/root/.cache/huggingface" +HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) + +NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" +mkdir -p "${NEURON_COMPILE_CACHE_URL}" +NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" + +# Try building the docker image +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws + +# prune old image and containers to save disk space, and only once a day +# by using a timestamp file in tmp. +if [ -f /tmp/neuron-docker-build-timestamp ]; then + last_build=$(cat /tmp/neuron-docker-build-timestamp) + current_time=$(date +%s) + if [ $((current_time - last_build)) -gt 86400 ]; then + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune -f + echo "$current_time" > /tmp/neuron-docker-build-timestamp + fi +else + date "+%s" > /tmp/neuron-docker-build-timestamp +fi + +docker build -t "${image_name}" -f docker/Dockerfile.neuron . + +# Setup cleanup +remove_docker_container() { + docker image rm -f "${image_name}" || true; +} +trap remove_docker_container EXIT + +# Run the image +docker run --rm -it --device=/dev/neuron0 --network bridge \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + -e "HF_TOKEN=${HF_TOKEN}" \ + -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ + -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ + --name "${container_name}" \ + ${image_name} \ + /bin/bash -c " + set -e; # Exit on first error + python3 /workspace/vllm/examples/offline_inference/neuron.py; + python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; + for f in /workspace/vllm/tests/neuron/2_core/*.py; do + echo \"Running test file: \$f\"; + python3 -m pytest \$f -v --capture=tee-sys; + done + " \ No newline at end of file diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh new file mode 100755 index 0000000..d39acae --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +set -xu + + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." 
+ # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + +# For HF_TOKEN. +source /etc/environment + +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. + +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off hf-transfer +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +# tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." + return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. 
+ overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 0 "test_perf.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" +run_and_track_test 1 "test_compilation.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" +run_and_track_test 2 "test_basic.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" +run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" +run_and_track_test 4 "test_quantization_accuracy.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" +run_and_track_test 5 "examples/offline_inference/tpu.py" \ + "python3 /workspace/vllm/examples/offline_inference/tpu.py" +run_and_track_test 6 "test_tpu_model_runner.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" +run_and_track_test 7 "test_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" +run_and_track_test 8 "test_topk_topp_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" +run_and_track_test 9 "test_multimodal.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" +run_and_track_test 10 "test_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" +run_and_track_test 11 "test_struct_output_generate.py" \ + "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 12 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 13 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 14 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 15 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 16 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." 
+ exit 0 +fi +# TODO: This test fails because it uses RANDOM_SEED sampling +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-xpu-test.sh new file mode 100644 index 0000000..deb61a9 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" +container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + +# Try building the docker image +docker build -t ${image_name} -f docker/Dockerfile.xpu . + +# Setup cleanup +remove_docker_container() { + docker rm -f "${container_name}" || true; + docker image rm -f "${image_name}" || true; + docker system prune -f || true; +} +trap remove_docker_container EXIT + +# Run the image and test offline inference/tensor parallel +docker run \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + --entrypoint="" \ + --name "${container_name}" \ + "${image_name}" \ + sh -c ' + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + cd tests + pytest -v -s v1/core + pytest -v -s v1/engine + pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py + pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py + pytest -v -s v1/structured_output + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py + pytest -v -s v1/test_serial_utils.py + pytest -v -s v1/test_utils.py + pytest -v -s v1/test_metrics_reader.py +' diff --git a/vllm_v0.10.0/.buildkite/scripts/rerun-test.sh b/vllm_v0.10.0/.buildkite/scripts/rerun-test.sh new file mode 100644 index 0000000..d79c0d5 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/rerun-test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Usage: ./rerun_test.sh path/to/test.py::test_name + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 path/to/test.py::test_name" + echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" + exit 1 +fi + +TEST=$1 +COUNT=1 + +while pytest -sv "$TEST"; do + COUNT=$((COUNT + 1)) + echo "RUN NUMBER ${COUNT}" +done diff --git a/vllm_v0.10.0/.buildkite/scripts/run-benchmarks.sh b/vllm_v0.10.0/.buildkite/scripts/run-benchmarks.sh new file mode 100644 index 0000000..195a806 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/run-benchmarks.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# This script is run by buildkite to run the benchmarks and upload the results to buildkite + +set -ex +set -o pipefail + +# cd 2 levels into the working directory +cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
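A quick gloss on the `cd` line above, since the comment is terse: `${BASH_SOURCE[0]}` is the path of the script itself, so the command moves into the directory two levels above wherever the script lives (.buildkite/scripts -> repository root), independent of the caller's working directory. A minimal self-contained sketch of the idiom (not part of the diff):

#!/bin/bash
set -euo pipefail

# Resolve the directory containing this script, even when it is invoked via a
# relative path or from a different working directory.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Step two levels up from the script's location, mirroring run-benchmarks.sh.
cd "${script_dir}/../.."
echo "now running from: $(pwd)"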
+ +(which wget && which curl) || (apt-get update && apt-get install -y wget curl) + +# run python-based benchmarks and upload the result to buildkite +python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +bench_latency_exit_code=$? + +python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +bench_throughput_exit_code=$? + +# run server-based benchmarks and upload the result to buildkite +python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & +server_pid=$! +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +# wait for server to start, timeout after 600 seconds +timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 +python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name sharegpt \ + --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ + --model meta-llama/Llama-2-7b-chat-hf \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer meta-llama/Llama-2-7b-chat-hf \ + --save-result \ + 2>&1 | tee benchmark_serving.txt +bench_serving_exit_code=$? +kill $server_pid + +# write the results into a markdown file +echo "### Latency Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line +echo "" >> benchmark_results.md +sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line + +echo "### Throughput Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line +echo "" >> benchmark_results.md +sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line + +echo "### Serving Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line +echo "" >> benchmark_results.md +echo '```' >> benchmark_results.md +tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines +echo '```' >> benchmark_results.md + +# if the agent binary is not found, skip uploading the results, exit 0 +if [ ! -f /usr/bin/buildkite-agent ]; then + exit 0 +fi + +# upload the results to buildkite +buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md + +# exit with the exit code of the benchmarks +if [ $bench_latency_exit_code -ne 0 ]; then + exit $bench_latency_exit_code +fi + +if [ $bench_throughput_exit_code -ne 0 ]; then + exit $bench_throughput_exit_code +fi + +if [ $bench_serving_exit_code -ne 0 ]; then + exit $bench_serving_exit_code +fi + +rm ShareGPT_V3_unfiltered_cleaned_split.json +buildkite-agent artifact upload "*.json" diff --git a/vllm_v0.10.0/.buildkite/scripts/run-multi-node-test.sh b/vllm_v0.10.0/.buildkite/scripts/run-multi-node-test.sh new file mode 100755 index 0000000..49aebce --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/run-multi-node-test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +set -euox pipefail + +if [[ $# -lt 4 ]]; then + echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" + exit 1 +fi + +WORKING_DIR=$1 +NUM_NODES=$2 +NUM_GPUS=$3 +DOCKER_IMAGE=$4 + +shift 4 +COMMANDS=("$@") +if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then + echo "The number of commands must be equal to the number of nodes." 
+ echo "Number of nodes: $NUM_NODES" + echo "Number of commands: ${#COMMANDS[@]}" + exit 1 +fi + +echo "List of commands" +for command in "${COMMANDS[@]}"; do + echo "$command" +done + +start_network() { + docker network create --subnet=192.168.10.0/24 docker-net +} + +start_nodes() { + for node in $(seq 0 $(($NUM_NODES-1))); do + GPU_DEVICES='"device=' + for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do + DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) + GPU_DEVICES+=$(($DEVICE_NUM)) + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then + GPU_DEVICES+=',' + fi + done + GPU_DEVICES+='"' + + # start the container in detached mode + # things to note: + # 1. --shm-size=10.24gb is required. don't use --ipc=host + # 2. pass HF_TOKEN to the container + # 3. map the huggingface cache directory to the container + # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: + # starting from 192.168.10.11) + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ + -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ + --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ + /bin/bash -c "tail -f /dev/null" + + # organize containers into a ray cluster + if [ "$node" -eq 0 ]; then + # start the ray head node + docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" + # wait for the head node to be ready + sleep 10 + else + # start the ray worker nodes, and connect them to the head node + docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + fi + done + + # wait for the cluster to be ready + sleep 10 + + # print the cluster status + docker exec node0 /bin/bash -c "ray status" +} + +run_nodes() { + # important: iterate in reverse order to start the head node last + # we start the worker nodes first, in detached mode, and then start the head node + # in the foreground, so that the output of the head node is visible in the buildkite logs + for node in $(seq $(($NUM_NODES - 1)) -1 0); do + GPU_DEVICES='"device=' + for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do + DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) + GPU_DEVICES+=$(($DEVICE_NUM)) + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then + GPU_DEVICES+=',' + fi + done + GPU_DEVICES+='"' + echo "Running node$node with GPU devices: $GPU_DEVICES" + if [ "$node" -ne 0 ]; then + docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + else + docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + fi + done +} +cleanup() { + for node in $(seq 0 $(($NUM_NODES-1))); do + docker stop "node$node" + done + docker network rm docker-net +} +trap cleanup EXIT +start_network +start_nodes +run_nodes + diff --git a/vllm_v0.10.0/.buildkite/scripts/tpu/cleanup_docker.sh b/vllm_v0.10.0/.buildkite/scripts/tpu/cleanup_docker.sh new file mode 100755 index 0000000..209d9c4 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/tpu/cleanup_docker.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -euo pipefail + +docker_root=$(docker info -f '{{.DockerRootDir}}') +if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 +fi +echo "Docker root directory: $docker_root" +# Check disk usage of the filesystem where Docker's root directory is located +disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') +# Define the threshold +threshold=70 +if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." 
+ # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." +else + echo "Disk usage is below $threshold%. No cleanup needed." +fi diff --git a/vllm_v0.10.0/.buildkite/scripts/tpu/config_v6e_1.env b/vllm_v0.10.0/.buildkite/scripts/tpu/config_v6e_1.env new file mode 100644 index 0000000..03ec116 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/tpu/config_v6e_1.env @@ -0,0 +1,14 @@ +# Environment config +TEST_NAME=llama8b +CONTAINER_NAME=vllm-tpu + +# vllm config +MODEL=meta-llama/Llama-3.1-8B-Instruct +MAX_NUM_SEQS=256 +MAX_NUM_BATCHED_TOKENS=1024 +TENSOR_PARALLEL_SIZE=1 +MAX_MODEL_LEN=2048 +DOWNLOAD_DIR=/mnt/disks/persist +EXPECTED_THROUGHPUT=8.0 +INPUT_LEN=1800 +OUTPUT_LEN=128 diff --git a/vllm_v0.10.0/.buildkite/scripts/tpu/docker_run_bm.sh b/vllm_v0.10.0/.buildkite/scripts/tpu/docker_run_bm.sh new file mode 100755 index 0000000..8959877 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/tpu/docker_run_bm.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +if [ ! -f "$1" ]; then + echo "Error: The env file '$1' does not exist." + exit 1 # Exit the script with a non-zero status to indicate an error +fi + +ENV_FILE=$1 + +# For testing on local vm, use `set -a` to export all variables +source /etc/environment +source $ENV_FILE + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; + docker rm -f $CONTAINER_NAME || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +LOG_ROOT=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $LOG_ROOT" + +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN is not set or is empty." + exit 1 +fi + +# Make sure mounted disk or dir exists +if [ ! -d "$DOWNLOAD_DIR" ]; then + echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder." + exit 1 +fi + +echo "Run model $MODEL" +echo + +echo "starting docker...$CONTAINER_NAME" +echo +docker run \ + -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ + --env-file $ENV_FILE \ + -e HF_TOKEN="$HF_TOKEN" \ + -e TARGET_COMMIT=$BUILDKITE_COMMIT \ + -e MODEL=$MODEL \ + -e WORKSPACE=/workspace \ + --name $CONTAINER_NAME \ + -d \ + --privileged \ + --network host \ + -v /dev/shm:/dev/shm \ + vllm/vllm-tpu-bm tail -f /dev/null + +echo "run script..." +echo +docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh" + +echo "copy result back..." +VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt +BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt +docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" +docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" + +if [ "$BUILDKITE" = "true" ]; then + echo "Running inside Buildkite" + buildkite-agent artifact upload "$VLLM_LOG" + buildkite-agent artifact upload "$BM_LOG" +else + echo "Not running inside Buildkite" +fi + +# +# compare the throughput with EXPECTED_THROUGHPUT +# and assert meeting the expectation +# +if [[ -z "$throughput" || ! 
"$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + echo "Failed to get the throughput" + exit 1 +fi + +if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then + echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" + exit 1 +fi diff --git a/vllm_v0.10.0/.buildkite/scripts/tpu/quantized_v6e_1.env b/vllm_v0.10.0/.buildkite/scripts/tpu/quantized_v6e_1.env new file mode 100644 index 0000000..bab34b3 --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/tpu/quantized_v6e_1.env @@ -0,0 +1,14 @@ +# Environment config +TEST_NAME=llama8bw8a8 +CONTAINER_NAME=vllm-tpu + +# vllm config +MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 +MAX_NUM_SEQS=128 +MAX_NUM_BATCHED_TOKENS=1024 +TENSOR_PARALLEL_SIZE=1 +MAX_MODEL_LEN=2048 +DOWNLOAD_DIR=/mnt/disks/persist +EXPECTED_THROUGHPUT=10.0 +INPUT_LEN=1800 +OUTPUT_LEN=128 diff --git a/vllm_v0.10.0/.buildkite/scripts/tpu/run_bm.sh b/vllm_v0.10.0/.buildkite/scripts/tpu/run_bm.sh new file mode 100755 index 0000000..877669c --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/tpu/run_bm.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -euo pipefail + +VLLM_LOG="$WORKSPACE/vllm_log.txt" +BM_LOG="$WORKSPACE/bm_log.txt" + +if [ -n "$TARGET_COMMIT" ]; then + head_hash=$(git rev-parse HEAD) + if [ "$TARGET_COMMIT" != "$head_hash" ]; then + echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" + exit 1 + fi +fi + +echo "model: $MODEL" +echo + +# +# create a log folder +# +mkdir "$WORKSPACE/log" + +# TODO: Move to image building. +pip install pandas +pip install datasets + +# +# create sonnet_4x +# +echo "Create sonnet_4x.txt" +echo "" > benchmarks/sonnet_4x.txt +for _ in {1..4} + do + cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt +done + +# +# start vllm service in backend +# +echo "lanching vllm..." +echo "logging to $VLLM_LOG" +echo + +VLLM_USE_V1=1 vllm serve $MODEL \ + --seed 42 \ + --disable-log-requests \ + --max-num-seqs $MAX_NUM_SEQS \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ + --no-enable-prefix-caching \ + --download_dir $DOWNLOAD_DIR \ + --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & + + +echo "wait for 20 minutes.." +echo +# sleep 1200 +# wait for 10 minutes... +for i in {1..120}; do + # TODO: detect other type of errors. + if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then + echo "Detected RuntimeError, exiting." + exit 1 + elif grep -Fq "Application startup complete" "$VLLM_LOG"; then + echo "Application started" + break + else + echo "wait for 10 seconds..." + sleep 10 + fi +done + +# +# run test +# +echo "run benchmark test..." +echo "logging to $BM_LOG" +echo +python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet_4x.txt \ + --sonnet-input-len $INPUT_LEN \ + --sonnet-output-len $OUTPUT_LEN \ + --ignore-eos > "$BM_LOG" + +echo "completed..." 
+echo + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput: $throughput" +echo diff --git a/vllm_v0.10.0/.buildkite/scripts/upload-wheels.sh b/vllm_v0.10.0/.buildkite/scripts/upload-wheels.sh new file mode 100644 index 0000000..037897e --- /dev/null +++ b/vllm_v0.10.0/.buildkite/scripts/upload-wheels.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +set -ex + +# Assume wheels are in artifacts/dist/*.whl +wheel_files=(artifacts/dist/*.whl) + +# Check that exactly one wheel is found +if [[ ${#wheel_files[@]} -ne 1 ]]; then + echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" + exit 1 +fi + +# Get the single wheel file +wheel="${wheel_files[0]}" + +# Rename 'linux' to 'manylinux1' in the wheel filename +new_wheel="${wheel/linux/manylinux1}" +mv -- "$wheel" "$new_wheel" +wheel="$new_wheel" + +# Extract the version from the wheel +version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +echo "Version: $version" + +normal_wheel="$wheel" # Save the original wheel filename + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + suffix="${version##*.}" + if [[ $suffix == cu* ]]; then + new_version="1.0.0.dev+${suffix}" + else + new_version="1.0.0.dev" + fi + new_wheel="${wheel/$version/$new_version}" + # use cp to keep both files in the artifacts directory + cp -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi + +# Upload the wheel to S3 +python3 .buildkite/generate_index.py --wheel "$normal_wheel" + +# generate index for this commit +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" + +if [[ $normal_wheel == *"cu118"* ]]; then + # if $normal_wheel matches cu118, do not upload the index.html + echo "Skipping index files for cu118 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" +else + # only upload index.html for cu128 wheels (default wheels) + aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" + aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" +fi + +# generate index for nightly +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" + +if [[ $normal_wheel == *"cu118"* ]]; then + # if $normal_wheel matches cu118, do not upload the index.html + echo "Skipping index files for cu118 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" +else + # only upload index.html for cu128 wheels (default wheels) + aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" +fi + +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" +aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" diff --git a/vllm_v0.10.0/.buildkite/test-pipeline.yaml b/vllm_v0.10.0/.buildkite/test-pipeline.yaml new file mode 100644 index 0000000..948ce9e --- /dev/null +++ b/vllm_v0.10.0/.buildkite/test-pipeline.yaml @@ -0,0 +1,847 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. 
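Stepping back briefly to upload-wheels.sh above: the renaming relies on plain bash parameter expansion, `${wheel/linux/manylinux1}`, which rewrites the first `linux` in the filename, and dev builds are additionally republished under the fixed alias `1.0.0.dev`. A dry-run sketch with a made-up wheel name (illustrative only, not part of the diff):

#!/bin/bash
set -euo pipefail

# Hypothetical wheel filename, for illustration only.
wheel="vllm-1.0.0.dev123-cp312-cp312-linux_x86_64.whl"

# ${var/pattern/replacement} rewrites the first match, turning the platform
# tag "linux" into "manylinux1" as the wheel index expects.
new_wheel="${wheel/linux/manylinux1}"
echo "$new_wheel"    # vllm-1.0.0.dev123-cp312-cp312-manylinux1_x86_64.whl

# Dev versions are also republished under a fixed alias so the nightly index
# always exposes a stable version string.
version="1.0.0.dev123"
if [[ $version == *dev* ]]; then
  echo "would also publish this wheel under version 1.0.0.dev"
fi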
+
+# This file will be fed into the Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+
+# Documentation
+# label(str): the name of the test. emoji allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on the vllm-against-torch-nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. it needs to be unblocked manually) unless it's a scheduled nightly run.
+# command(str): the single command to run for the test. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware platforms to also run the test on. currently only supports [amd]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. currently only supports a100
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate a multi-node setup by launching multiple containers on one host,
+#   in this case, commands must be specified. the first command runs on the first host, the second
+#   command runs on the second host.
+# working_dir(str): specify the place where the command should execute. defaults to /vllm-workspace/tests
+# source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run.
+
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add it to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+#   Note that all steps execute in parallel.
+
+steps:
+##### fast check tests #####
+
+- label: Documentation Build # 2min
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/test_docs"
+  fast_check: true
+  no_gpu: True
+  commands:
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies.
Please check the error message and add the package to whitelist + # in /vllm/tools/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 24min + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/mq_llm_engine + - tests/async_engine + - tests/test_inputs + - tests/multimodal + - tests/test_utils + - tests/worker + - tests/standalone_tests/lazy_imports.py + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s mq_llm_engine # MQLLMEngine + - pytest -v -s async_engine # AsyncLLMEngine + - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s multimodal + - pytest -v -s test_utils.py # Utils + - pytest -v -s worker # Worker + +- label: Python-only Installation Test + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 30min + mirror_hardwares: [amdexperimental, amdproduction] + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_preemption + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +- label: Chunked Prefill Test + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_chunked_prefill + commands: + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py + +- label: Core Test # 10min + mirror_hardwares: [amdexperimental, amdproduction] + fast_check: true + source_file_dependencies: + - vllm/core + - vllm/distributed + - tests/core + commands: + - pytest -v -s core + +- label: Entrypoints Test (LLM) # 40min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process + - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process + - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Test (API Server) # 40min + 
mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Distributed Tests (4 GPUs) # 10min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/core/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/test_basic_correctness + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/test_async_llm_dp.py + - tests/v1/test_external_lb_dp.py + - tests/v1/test_internal_lb_dp.py + - tests/v1/test_hybrid_lb_dp.py + - tests/v1/engine/test_engine_core_client.py + commands: + # test with tp=2 and external_dp=2 + - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: EPLB Algorithm Test + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 5min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + +- label: Metrics, Tracing Test # 10min + mirror_hardwares: [amdexperimental, amdproduction] + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/metrics + - tests/tracing + commands: + - pytest -v -s metrics + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s tracing + +##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - vllm/ + 
- tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 10min + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 Test + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - pytest -v -s v1/core + - pytest -v -s v1/engine + - pytest -v -s v1/entrypoints + - pytest -v -s v1/sample + - pytest -v -s v1/worker + - pytest -v -s v1/structured_output + - pytest -v -s v1/spec_decode + - pytest -v -s v1/kv_connector/unit + - pytest -v -s v1/metrics + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s v1/test_utils.py + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_metrics_reader.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: Examples Test # 25min + mirror_hardwares: [amdexperimental, amdproduction] + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 + +- label: Prefix Caching Test # 9min + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - vllm/ + - tests/prefix_caching + commands: + - pytest -v -s prefix_caching + + +- label: Platform Tests (CUDA) + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Samplers Test # 36min + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - 
vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +- label: LoRA Test %N # 15min each + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/lora + - tests/lora + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + parallelism: 4 + +- label: PyTorch Compilation Unit Tests + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_silu_mul_quant_fusion.py + - pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py + +- label: PyTorch Fullgraph Smoke Test # 9min + mirror_hardwares: [amdexperimental, amdproduction] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + # these tests need to be separated, cannot combine + - pytest -v -s compile/piecewise/test_simple.py + - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/piecewise/test_full_cudagraph.py + +- label: PyTorch Fullgraph Test # 18min + mirror_hardwares: [amdexperimental, amdproduction] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py + +- label: Kernels Core Operation Test + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - csrc/ + - tests/kernels/core + commands: + - pytest -v -s kernels/core + +- label: Kernels Attention Test %N + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + commands: + - pytest -v -s kernels/moe + +- label: Kernels Mamba Test + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + commands: + - pytest -v -s kernels/mamba + +- label: Tensorizer Test # 11min + mirror_hardwares: [amdexperimental, amdproduction] + soft_fail: true + source_file_dependencies: + - vllm/model_executor/model_loader + - tests/tensorizer_loader + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tensorizer_loader + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Model Executor Test + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - 
vllm/model_executor + - tests/model_executor + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + +- label: Benchmarks # 9min + mirror_hardwares: [amdexperimental, amdproduction] + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 10min + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release + - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + +- label: LM Eval Small Models # 53min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: OpenAI API correctness + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ + +- label: Encoder Decoder tests # 5min + mirror_hardwares: [amdexperimental, amdproduction] + source_file_dependencies: + - vllm/ + - tests/encoder_decoder + commands: + - pytest -v -s encoder_decoder + +- label: OpenAI-Compatible Tool Use # 20 min + mirror_hardwares: [amdexperimental, amdproduction] + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + - tests/mistral_tool_use + commands: + - pytest -v -s tool_use + - pytest -v -s mistral_tool_use + +##### models test ##### + +- label: Basic Models Test # 24min + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models + commands: + - pytest -v -s models/test_transformers.py + - pytest -v -s models/test_registry.py + - pytest -v -s models/test_utils.py + - pytest -v -s models/test_vision.py + - pytest -v -s models/test_initialization.py + +- label: Language Models Test (Standard) + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m core_model + +- label: Language Models Test (Hybrid) # 35 min + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
+ - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language/generation -m hybrid_model + +- label: Language Models Test (Extended Generation) # 1hr20min + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (Extended Pooling) # 36min + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Multi-Modal Models Test (Standard) + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal/processing + - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model + - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Models Test (Extended) 1 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + +- label: Multi-Modal Models Test (Extended) 2 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + mirror_hardwares: [amdexperimental, amdproduction] + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* + +- label: Transformers Nightly Models Test + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/audio_language.py --model-type whisper + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + mirror_hardwares: [amdexperimental, amdproduction] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 40min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - vllm/compilation + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/model_runner.py + - entrypoints/llm/test_collective_rpc.py + - tests/v1/test_async_llm_dp.py + - tests/v1/test_external_lb_dp.py + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - vllm/v1/engine/ + commands: + - TP_SIZE=1 DP_SIZE=2 pytest -v -s 
v1/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' + # test sequence parallel + - pytest -v -s distributed/test_sequence_parallel.py + # this test fails consistently. + # TODO: investigate and fix + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s models/multimodal/generation/test_maverick.py + +- label: Plugin Tests (2 GPUs) # 40min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + +- label: Multi-step Tests (4 GPUs) # 36min + mirror_hardwares: [amdexperimental, amdproduction] + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/model_executor/layers/sampler.py + - vllm/sequence.py + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/multi_step_worker.py + - vllm/worker/model_runner_base.py + - vllm/worker/model_runner.py + - vllm/worker/multi_step_model_runner.py + - vllm/engine + - tests/multi_step + commands: + # this test is quite flaky + # TODO: investigate and fix. 
+ # - pytest -v -s multi_step/test_correctness_async_llm.py + - pytest -v -s multi_step/test_correctness_llm.py + +- label: Pipeline Parallelism Test # 45min + mirror_hardwares: [amdexperimental, amdproduction] + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA TP Test (Distributed) + mirror_hardwares: [amdexperimental, amdproduction] + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + + +- label: Weight Loading Multiple GPU Test # 33min + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + + +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 diff --git a/vllm_v0.10.0/.clang-format b/vllm_v0.10.0/.clang-format new file mode 100644 index 0000000..7f9e6d7 --- /dev/null +++ b/vllm_v0.10.0/.clang-format @@ -0,0 +1,26 @@ +BasedOnStyle: Google +UseTab: Never +IndentWidth: 2 +ColumnLimit: 80 + +# Force pointers to the type for C++. 
+DerivePointerAlignment: false +PointerAlignment: Left + +# Reordering #include statements can (and currently will) introduce errors +SortIncludes: false + +# Style choices +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +IndentPPDirectives: BeforeHash + +IncludeCategories: + - Regex: '^<' + Priority: 4 + - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' + Priority: 3 + - Regex: '^"(qoda|\.\.)/' + Priority: 2 + - Regex: '.*' + Priority: 1 diff --git a/vllm_v0.10.0/.dockerignore b/vllm_v0.10.0/.dockerignore new file mode 100644 index 0000000..3863656 --- /dev/null +++ b/vllm_v0.10.0/.dockerignore @@ -0,0 +1,33 @@ +/.venv +/build +dist +vllm/*.so + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +.mypy_cache + +# Distribution / packaging +.Python +/build/ +cmake-build-*/ +CMakeUserPresets.json +develop-eggs/ +/dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/vllm_v0.10.0/.gemini/config.yaml b/vllm_v0.10.0/.gemini/config.yaml new file mode 100644 index 0000000..2499d3f --- /dev/null +++ b/vllm_v0.10.0/.gemini/config.yaml @@ -0,0 +1,6 @@ +# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github +have_fun: false # Just review the code +code_review: + comment_severity_threshold: HIGH # Reduce quantity of comments + pull_request_opened: + summary: false # Don't summarize the PR in a separate comment diff --git a/vllm_v0.10.0/.gitignore b/vllm_v0.10.0/.gitignore new file mode 100644 index 0000000..96b97a5 --- /dev/null +++ b/vllm_v0.10.0/.gitignore @@ -0,0 +1,205 @@ +# version file generated by setuptools-scm +/vllm/_version.py + +# vllm-flash-attn built from source +vllm/vllm_flash_attn/* + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +cmake-build-*/ +CMakeUserPresets.json +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +/.deps/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# generated files +**/generated/** + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site +docs/argparse +docs/examples + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# VSCode +.vscode/ + +# DS Store +.DS_Store + +# Results +*.csv + +# Python pickle files +*.pkl + +# Sphinx documentation +_build/ + +# vim swap files +*.swo +*.swp + +# hip files generated by PyTorch +*.hip +*_hip* +hip_compat.h + +# Benchmark dataset +benchmarks/**/*.json + +# Linting +actionlint +shellcheck*/ + +# Ignore moe/marlin_moe gen code +csrc/moe/marlin_moe_wna16/kernel_* diff --git a/vllm_v0.10.0/.pre-commit-config.yaml b/vllm_v0.10.0/.pre-commit-config.yaml new file mode 100644 index 0000000..5197820 --- /dev/null +++ b/vllm_v0.10.0/.pre-commit-config.yaml @@ -0,0 +1,177 @@ +default_install_hook_types: + - pre-commit + - commit-msg +default_stages: + - pre-commit # Run locally + - manual # Run in CI +exclude: 'vllm/third_party/.*' +repos: +- repo: https://github.com/google/yapf + rev: v0.43.0 + hooks: + - id: yapf + args: [--in-place, --verbose] + # Keep the same list from yapfignore here to avoid yapf failing without any inputs + exclude: '(.buildkite|benchmarks|build|examples)/.*' +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.7 + hooks: + - id: ruff + args: [--output-format, github, --fix] + - id: ruff-format + files: ^(.buildkite|benchmarks|examples)/.* +- repo: https://github.com/crate-ci/typos + rev: v1.34.0 + hooks: + - id: typos +- repo: https://github.com/PyCQA/isort + rev: 6.0.1 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v20.1.3 + hooks: + - id: clang-format + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' + types_or: [c++, cuda] + args: [--style=file, --verbose] +- repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.29 + hooks: + - id: pymarkdown + exclude: '.*\.inc\.md' + args: [fix] +- repo: https://github.com/rhysd/actionlint + rev: v1.7.7 + hooks: + - id: actionlint +- repo: 
https://github.com/astral-sh/uv-pre-commit + rev: 0.6.17 + hooks: + - id: pip-compile + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] + files: ^requirements/test\.(in|txt)$ +- repo: local + hooks: + - id: format-torch-nightly-test + name: reformat nightly_torch_test.txt to be in sync with test.in + language: python + entry: python tools/generate_nightly_torch_test.py + files: ^requirements/test\.(in|txt)$ + - id: mypy-local + name: Run mypy for local Python installation + entry: tools/mypy.sh 0 "local" + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] + stages: [pre-commit] # Don't run in CI + - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.9 + entry: tools/mypy.sh 1 "3.9" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.10 + entry: tools/mypy.sh 1 "3.10" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.11 + entry: tools/mypy.sh 1 "3.11" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.12 + entry: tools/mypy.sh 1 "3.12" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: shellcheck + name: Lint shell scripts + entry: tools/shellcheck.sh + language: script + types: [shell] + - id: png-lint + name: Lint PNG exports from excalidraw + entry: tools/png-lint.sh + language: script + types: [png] + - id: signoff-commit + name: Sign-off Commit + entry: bash + args: + - -c + - | + if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then + printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)" + fi + language: system + verbose: true + stages: [commit-msg] + - id: check-spdx-header + name: Check SPDX headers + entry: python tools/check_spdx_header.py + language: python + types: [python] + - id: check-root-lazy-imports + name: Check root lazy imports + entry: python tools/check_init_lazy_imports.py + language: python + types: [python] + - id: check-filenames + name: Check for spaces in all filenames + entry: bash + args: + - -c + - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" 
&& exit 1 || exit 0' + language: system + always_run: true + pass_filenames: false + - id: update-dockerfile-graph + name: Update Dockerfile dependency graph + entry: tools/update-dockerfile-graph.sh + language: script + - id: enforce-import-regex-instead-of-re + name: Enforce import regex as re + entry: python tools/enforce_regex_import.py + language: python + types: [python] + pass_filenames: false + additional_dependencies: [regex] + # forbid directly import triton + - id: forbid-direct-triton-import + name: "Forbid direct 'import triton'" + entry: python tools/check_triton_import.py + language: python + types: [python] + pass_filenames: false + additional_dependencies: [regex] + - id: check-pickle-imports + name: Prevent new pickle/cloudpickle imports + entry: python tools/check_pickle_imports.py + language: python + types: [python] + pass_filenames: false + additional_dependencies: [pathspec, regex] + - id: validate-config + name: Validate configuration has default values and that each field has a docstring + entry: python tools/validate_config.py + language: python + types: [python] + pass_filenames: true + files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py + # Keep `suggestion` last + - id: suggestion + name: Suggestion + entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=."' + language: system + verbose: true + pass_filenames: false + # Insert new entries above the `suggestion` entry diff --git a/vllm_v0.10.0/.readthedocs.yaml b/vllm_v0.10.0/.readthedocs.yaml new file mode 100644 index 0000000..98c3be2 --- /dev/null +++ b/vllm_v0.10.0/.readthedocs.yaml @@ -0,0 +1,17 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +mkdocs: + configuration: mkdocs.yaml + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: requirements/docs.txt diff --git a/vllm_v0.10.0/.shellcheckrc b/vllm_v0.10.0/.shellcheckrc new file mode 100644 index 0000000..f3b6eed --- /dev/null +++ b/vllm_v0.10.0/.shellcheckrc @@ -0,0 +1,9 @@ +# rules currently disabled: +# +# SC1091 (info): Not following: was not specified as input (see shellcheck -x) +# SC2004 (style): $/${} is unnecessary on arithmetic variables. +# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. +# SC2155 (warning): Declare and assign separately to avoid masking return values. +# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. +# +disable=SC1091,SC2004,SC2129,SC2155,SC2164 diff --git a/vllm_v0.10.0/.yapfignore b/vllm_v0.10.0/.yapfignore new file mode 100644 index 0000000..2d6dcf8 --- /dev/null +++ b/vllm_v0.10.0/.yapfignore @@ -0,0 +1 @@ +collect_env.py diff --git a/vllm_v0.10.0/CMakeLists.txt b/vllm_v0.10.0/CMakeLists.txt new file mode 100644 index 0000000..98ed682 --- /dev/null +++ b/vllm_v0.10.0/CMakeLists.txt @@ -0,0 +1,889 @@ +cmake_minimum_required(VERSION 3.26) + +# When building directly using CMake, make sure you run the install step +# (it places the .so files in the correct location). +# +# Example: +# mkdir build && cd build +# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. .. +# cmake --build . --target install +# +# If you want to only build one target, make sure to install it manually: +# cmake --build . 
--target _C +# cmake --install . --component _C +project(vllm_extensions LANGUAGES CXX) + +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") +message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) + +# Suppress potential warnings about unused manually-specified variables +set(ignoreMe "${VLLM_PYTHON_PATH}") + +# Prevent installation of dependencies (cutlass) by default. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + +# +# Supported python versions. These versions will be searched in order, the +# first match will be selected. These should be kept in sync with setup.py. +# +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") + +# Supported AMD GPU architectures. +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") + +# +# Supported/expected torch versions for CUDA/ROCm. +# +# Currently, having an incorrect pytorch version results in a warning +# rather than an error. +# +# Note: the CUDA torch version is derived from pyproject.toml and various +# requirements.txt files and should be kept consistent. The ROCm torch +# versions are derived from docker/Dockerfile.rocm +# +set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") + +# +# Try to find python package with an executable that exactly matches +# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions. +# +if (VLLM_PYTHON_EXECUTABLE) + find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}") +else() + message(FATAL_ERROR + "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version" + " before running cmake configure.") +endif() + +# +# Update cmake's `CMAKE_PREFIX_PATH` with torch location. +# +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + +# Ensure the 'nvcc' command is in the PATH +find_program(NVCC_EXECUTABLE nvcc) +if (CUDA_FOUND AND NOT NVCC_EXECUTABLE) + message(FATAL_ERROR "nvcc not found") +endif() + +# +# Import torch cmake configuration. +# Torch also imports CUDA (and partially HIP) languages with some customizations, +# so there is no need to do this explicitly with check_language/enable_language, +# etc. +# +find_package(Torch REQUIRED) + +# Supported NVIDIA architectures. +# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined +if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) + set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") +else() + set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") +endif() + +# +# Forward the non-CUDA device extensions to external CMake scripts. +# +if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND + NOT VLLM_TARGET_DEVICE STREQUAL "rocm") + if (VLLM_TARGET_DEVICE STREQUAL "cpu") + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + else() + return() + endif() + return() +endif() + +# +# Set up GPU language and check the torch version and warn if it isn't +# what is expected. 
+#
+if (NOT HIP_FOUND AND CUDA_FOUND)
+  set(VLLM_GPU_LANG "CUDA")
+
+  if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
+      "expected for CUDA build, saw ${Torch_VERSION} instead.")
+  endif()
+elseif(HIP_FOUND)
+  set(VLLM_GPU_LANG "HIP")
+
+  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
+  # not let cmake recognize .hip files. In order to get cmake to understand the
+  # .hip extension automatically, HIP must be enabled explicitly.
+  enable_language(HIP)
+
+  # ROCm 5.X and 6.X
+  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+    message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
+      "expected for ROCm build, saw ${Torch_VERSION} instead.")
+  endif()
+else()
+  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
+endif()
+
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  #
+  # For CUDA we want to be able to control which architectures we compile for on
+  # a per-file basis in order to cut down on compile time. So here we extract
+  # the set of architectures we want to compile for and remove them from the
+  # CMAKE_CUDA_FLAGS so that they are not applied globally.
+  #
+  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+  # Filter the target architectures by the supported archs
+  # since for some files we will build for all CUDA_ARCHS.
+  cuda_archs_loose_intersection(CUDA_ARCHS
+    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+else()
+  #
+  # For other GPU targets override the GPU architectures detected by cmake/torch
+  # and filter them by the supported versions for the current language.
+  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
+  #
+  override_gpu_arches(VLLM_GPU_ARCHES
+    ${VLLM_GPU_LANG}
+    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+endif()
+
+#
+# Query torch for additional GPU compilation flags for the given
+# `VLLM_GPU_LANG`.
+# The final set of flags is stored in `VLLM_GPU_FLAGS`.
+#
+get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
+
+#
+# Set nvcc parallelism.
+#
+if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
+endif()
+
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
+# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
+# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/.
+#
+include(FetchContent)
+file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
+
+if(VLLM_GPU_LANG STREQUAL "HIP")
+  #
+  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
+  #
+  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
+
+  #
+  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
+  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
+ # + set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") +endif() + +# +# Define other extension targets +# + +# +# cumem_allocator extension +# + +set(VLLM_CUMEM_EXT_SRC + "csrc/cumem_allocator.cpp") + +set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + message(STATUS "Enabling cumem allocator extension.") + # link against cuda driver library + list(APPEND CUMEM_LIBS CUDA::cuda_driver) + define_gpu_extension_target( + cumem_allocator + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_CUMEM_EXT_SRC} + LIBRARIES ${CUMEM_LIBS} + USE_SABI 3.8 + WITH_SOABI) +endif() + +# +# _C extension +# + +set(VLLM_EXT_SRC + "csrc/mamba/mamba_ssm/selective_scan_fwd.cu" + "csrc/cache_kernels.cu" + "csrc/attention/paged_attention_v1.cu" + "csrc/attention/paged_attention_v2.cu" + "csrc/attention/merge_attn_states.cu" + "csrc/attention/vertical_slash_index.cu" + "csrc/pos_encoding_kernels.cu" + "csrc/activation_kernels.cu" + "csrc/layernorm_kernels.cu" + "csrc/layernorm_quant_kernels.cu" + "csrc/sampler.cu" + "csrc/cuda_view.cu" + "csrc/quantization/gptq/q_gemm.cu" + "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" + "csrc/quantization/fp8/common.cu" + "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" + "csrc/quantization/gguf/gguf_kernel.cu" + "csrc/quantization/activation_kernels.cu" + "csrc/cuda_utils_kernels.cu" + "csrc/prepare_inputs/advance_step.cu" + "csrc/custom_all_reduce.cu" + "csrc/torch_bindings.cpp") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") + + # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. + set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") + + # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided + if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) + set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR}) + endif() + + if(VLLM_CUTLASS_SRC_DIR) + if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR) + get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE) + endif() + message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation") + FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR}) + else() + FetchContent_Declare( + cutlass + GIT_REPOSITORY https://github.com/nvidia/cutlass.git + # Please keep this in sync with CUTLASS_REVISION line above. + GIT_TAG ${CUTLASS_REVISION} + GIT_PROGRESS TRUE + + # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. + # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. 
+ # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE + GIT_SHALLOW TRUE + ) + endif() + FetchContent_MakeAvailable(cutlass) + + list(APPEND VLLM_EXT_SRC + "csrc/quantization/aqlm/gemm_kernels.cu" + "csrc/quantization/awq/gemm_kernels.cu" + "csrc/permute_cols.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/fp4/nvfp4_quant_entry.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" + "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" + "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" + "csrc/cutlass_extensions/common.cpp" + "csrc/attention/mla/cutlass_mla_entry.cu" + "csrc/quantization/fp8/per_token_group_quant.cu") + + set_gencode_flags_for_srcs( + SRCS "${VLLM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + + # Only build Marlin kernels if we are building for at least some compatible archs. + # Keep building Marlin for 9.0 as there are some group sizes and shapes that + # are not supported by Machete yet. + # 9.0 for latest bf16 atomicAdd PTX + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") + if (MARLIN_ARCHS) + + # + # For the Marlin kernels we automatically generate sources for various + # preselected input type pairs and schedules. + # Generate sources: + set(MARLIN_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py) + file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) + + message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}") + message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}") + + if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH} + OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=$PYTHONPATH + ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} + RESULT_VARIABLE marlin_generation_result + OUTPUT_VARIABLE marlin_generation_result + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log + ) + + if (NOT marlin_generation_result EQUAL 0) + message(FATAL_ERROR "Marlin generation failed." 
+ " Result: \"${marlin_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log") + else() + set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH} + CACHE STRING "Last run Marlin generate script hash" FORCE) + message(STATUS "Marlin generation completed successfully.") + endif() + else() + message(STATUS "Marlin generation script has not changed, skipping generation.") + endif() + + file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_ARCHS}") + + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) + + set(MARLIN_SRCS + "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" + "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" + "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" + "csrc/quantization/gptq_marlin/gptq_marlin.cu" + "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_SRCS}" + CUDA_ARCHS "${MARLIN_ARCHS}") + list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") + message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") + else() + message(STATUS "Not building Marlin kernels as no compatible archs found" + " in CUDA target architectures") + endif() + + # Only build AllSpark kernels if we are building for at least some compatible archs. + cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") + if (ALLSPARK_ARCHS) + set(ALLSPARK_SRCS + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") + set_gencode_flags_for_srcs( + SRCS "${ALLSPARK_SRCS}" + CUDA_ARCHS "${ALLSPARK_ARCHS}") + list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}") + message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") + else() + message(STATUS "Not building AllSpark kernels as no compatible archs found" + " in CUDA target architectures") + endif() + + + set(SCALED_MM_3X_ARCHS) + # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. 
CUTLASS 3.x) require + # CUDA 12.0 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") + else() + message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + + # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.8 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") + else() + message(STATUS "Not building scaled_mm_c3x_120 as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + + # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. 
CUTLASS 3.x) + # require CUDA 12.8 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") + else() + message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + # + # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) + # kernels for the remaining archs that are not already built for 3x. + # (Build 8.9 for FP8) + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}") + # subtract out the archs that are already built for 3x + list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) + if (SCALED_MM_2X_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1") + message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}") + else() + if (SCALED_MM_3X_ARCHS) + message(STATUS "Not building scaled_mm_c2x as all archs are already built" + " for and covered by scaled_mm_c3x") + else() + message(STATUS "Not building scaled_mm_c2x as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + # + # 2:4 Sparse Kernels + + # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor + # require CUDA 12.2 or later (and only work on Hopper). 
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS) + set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") + message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS) + message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " + "if you intend on running FP8 sparse quantized models on Hopper.") + else() + message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + # FP4 Archs and flags + cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) + set(SRCS + "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/nvfp4_experts_quant.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" + "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${FP4_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") + message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") + else() + message(STATUS "Not building NVFP4 as no compatible archs were found.") + # clear FP4_ARCHS + set(FP4_ARCHS) + endif() + + # CUTLASS MLA Archs and flags + cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) + set(SRCS + "csrc/attention/mla/cutlass_mla_kernels.cu" + "csrc/attention/mla/sm100_cutlass_mla_kernel.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${MLA_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1") + # Add MLA-specific include directories only to MLA source files + set_source_files_properties(${SRCS} + PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common") + message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}") + else() + message(STATUS "Not building CUTLASS MLA as no compatible archs were found.") + # clear MLA_ARCHS + set(MLA_ARCHS) + endif() + + # CUTLASS MoE kernels + + # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works + # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled + # if it's possible to compile MoE kernels that use its output. 
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures.") + endif() + endif() + + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " + "if you intend on running FP8 quantized MoE models on Blackwell.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures.") + endif() + endif() + + # moe_data.cu is used by all CUTLASS MoE kernels. 
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) + message(STATUS "Not building moe_data as CUDA Compiler version is " + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.") + else() + message(STATUS "Not building moe_data as no compatible archs found " + "in CUDA target architectures.") + endif() + endif() + + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") + message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " + "if you intend on running FP8 quantized MoE models on Blackwell.") + else() + message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + # + # Machete kernels + + # The machete kernels only work on hopper and require CUDA 12.0 or later. + # Only build Machete kernels if we are building for something compatible with sm90a + cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS) + # + # For the Machete kernels we automatically generate sources for various + # preselected input type pairs and schedules. + # Generate sources: + set(MACHETE_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) + file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) + + message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}") + message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}") + + if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} + OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} + RESULT_VARIABLE machete_generation_result + OUTPUT_VARIABLE machete_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ) + + if (NOT machete_generation_result EQUAL 0) + message(FATAL_ERROR "Machete generation failed." 
+ " Result: \"${machete_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") + else() + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + CACHE STRING "Last run machete generate script hash" FORCE) + message(STATUS "Machete generation completed successfully.") + endif() + else() + message(STATUS "Machete generation script has not changed, skipping generation.") + endif() + + # Add machete generated sources + file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") + list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) + + # forward compatible + set_gencode_flags_for_srcs( + SRCS "${MACHETE_GEN_SOURCES}" + CUDA_ARCHS "${MACHETE_ARCHS}") + + list(APPEND VLLM_EXT_SRC + csrc/quantization/machete/machete_pytorch.cu) + + message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 + AND MACHETE_ARCHS) + message(STATUS "Not building Machete kernels as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running w4a16 quantized models on " + "Hopper.") + else() + message(STATUS "Not building Machete kernels as no compatible archs " + "found in CUDA target architectures") + endif() + endif() +# if CUDA endif +endif() + +if (VLLM_GPU_LANG STREQUAL "HIP") + # Add QuickReduce kernels + list(APPEND VLLM_EXT_SRC + "csrc/custom_quickreduce.cu" + ) +# if ROCM endif +endif() + +message(STATUS "Enabling C extension.") +define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + USE_SABI 3 + WITH_SOABI) + +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# driver API. This causes problems when linking with earlier versions of CUDA. +# Setting this variable sidesteps the issue by calling the driver directly. +target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) + +# +# _moe_C extension +# + +set(VLLM_MOE_EXT_SRC + "csrc/moe/torch_bindings.cpp" + "csrc/moe/moe_align_sum_kernels.cu" + "csrc/moe/topk_softmax_kernels.cu") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") +endif() + +set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(VLLM_MOE_WNA16_SRC + "csrc/moe/moe_wna16.cu") + + set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_WNA16_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") + # 9.0 for latest bf16 atomicAdd PTX + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") + if (MARLIN_MOE_ARCHS) + + # + # For the Marlin MOE kernels we automatically generate sources for various + # preselected input type pairs and schedules. 
+ # Generate sources: + set(MOE_MARLIN_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) + file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) + + message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") + message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") + + if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} + OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=$PYTHONPATH + ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} + RESULT_VARIABLE moe_marlin_generation_result + OUTPUT_VARIABLE moe_marlin_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log + ) + + if (NOT moe_marlin_generation_result EQUAL 0) + message(FATAL_ERROR "Marlin MOE generation failed." + " Result: \"${moe_marlin_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") + else() + set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} + CACHE STRING "Last run Marlin MOE generate script hash" FORCE) + message(STATUS "Marlin MOE generation completed successfully.") + endif() + else() + message(STATUS "Marlin MOE generation script has not changed, skipping generation.") + endif() + + file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") + set_gencode_flags_for_srcs( + SRCS "${MOE_WNAA16_MARLIN_SRC}" + CUDA_ARCHS "${MARLIN_MOE_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) + + message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") + else() + message(STATUS "Not building Marlin MOE kernels as no compatible archs found" + " in CUDA target architectures") + endif() +endif() + +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(MOE_PERMUTE_SRC + "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu" + "csrc/moe/moe_permute_unpermute_op.cu") + + set_gencode_flags_for_srcs( + SRCS "${MARLIN_PERMUTE_SRC}" + CUDA_ARCHS "${MOE_PERMUTE_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}") +endif() +message(STATUS "Enabling moe extension.") +define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + USE_SABI 3 + WITH_SOABI) + +if(VLLM_GPU_LANG STREQUAL "HIP") + # + # _rocm_C extension + # + set(VLLM_ROCM_EXT_SRC + "csrc/rocm/torch_bindings.cpp" + "csrc/rocm/skinny_gemms.cu" + "csrc/rocm/attention.cu") + + define_gpu_extension_target( + _rocm_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_ROCM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) +endif() + +# For CUDA we also build and ship some external projects. 
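+# (Illustrative note, assuming the usual setup.py-driven build: these includes run during the
+# normal CMake configure step, e.g. as part of a `pip install -e .` source build, and pull in
+# FlashMLA and vllm-flash-attn via the scripts under cmake/external_projects/.)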
+if (VLLM_GPU_LANG STREQUAL "CUDA") + include(cmake/external_projects/flashmla.cmake) + + # vllm-flash-attn should be last as it overwrites some CMake functions + include(cmake/external_projects/vllm_flash_attn.cmake) +endif () diff --git a/vllm_v0.10.0/CODE_OF_CONDUCT.md b/vllm_v0.10.0/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..5268ff1 --- /dev/null +++ b/vllm_v0.10.0/CODE_OF_CONDUCT.md @@ -0,0 +1,127 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Slack](https://slack.vllm.ai). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. 
Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). diff --git a/vllm_v0.10.0/CONTRIBUTING.md b/vllm_v0.10.0/CONTRIBUTING.md new file mode 100644 index 0000000..2947aad --- /dev/null +++ b/vllm_v0.10.0/CONTRIBUTING.md @@ -0,0 +1,3 @@ +# Contributing to vLLM + +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). diff --git a/vllm_v0.10.0/DCO b/vllm_v0.10.0/DCO new file mode 100644 index 0000000..49b8cb0 --- /dev/null +++ b/vllm_v0.10.0/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. 
+ + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/vllm_v0.10.0/LICENSE b/vllm_v0.10.0/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/vllm_v0.10.0/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vllm_v0.10.0/MANIFEST.in b/vllm_v0.10.0/MANIFEST.in new file mode 100644 index 0000000..82fd22b --- /dev/null +++ b/vllm_v0.10.0/MANIFEST.in @@ -0,0 +1,10 @@ +include LICENSE +include requirements/common.txt +include requirements/cuda.txt +include requirements/rocm.txt +include requirements/neuron.txt +include requirements/cpu.txt +include CMakeLists.txt + +recursive-include cmake * +recursive-include csrc * diff --git a/vllm_v0.10.0/README.md b/vllm_v0.10.0/README.md new file mode 100644 index 0000000..dc2f0af --- /dev/null +++ b/vllm_v0.10.0/README.md @@ -0,0 +1,165 @@ +

+# vLLM
+
+Easy, fast, and cheap LLM serving for everyone
+
+| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
+ +--- + +*Latest News* 🔥 +- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). +- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). +- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). +- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). + +
+Previous News + +- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). +- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). +- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). +- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). +- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! +- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). +- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). +- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). +- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). 
+- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). +- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). +- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). +- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. +- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). + +
+ +--- +## About + +vLLM is a fast and easy-to-use library for LLM inference and serving. + +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. + +vLLM is fast with: + +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer +- Speculative decoding +- Chunked prefill + +vLLM is flexible and easy to use with: + +- Seamless integration with popular Hugging Face models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor, pipeline, data and expert parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron +- Prefix caching support +- Multi-LoRA support + +vLLM seamlessly supports most popular open-source models on HuggingFace, including: +- Transformer-like LLMs (e.g., Llama) +- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) +- Embedding Models (e.g., E5-Mistral) +- Multi-modal LLMs (e.g., LLaVA) + +Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). + +## Getting Started + +Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source): + +```bash +pip install vllm +``` + +Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. +- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html) +- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) +- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) + +## Contributing + +We welcome and value any contributions and collaborations. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved. + +## Sponsors + +vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! + + + +Cash Donations: +- a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: +- AMD +- Anyscale +- AWS +- Crusoe Cloud +- Databricks +- DeepInfra +- Google Cloud +- Intel +- Lambda Lab +- Nebius +- Novita AI +- NVIDIA +- Replicate +- Roblox +- RunPod +- Trainy +- UC Berkeley +- UC San Diego + +Slack Sponsor: Anyscale + +We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. + +## Citation + +If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): + +```bibtex +@inproceedings{kwon2023efficient, + title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, + author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. 
Gonzalez and Hao Zhang and Ion Stoica}, + booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, + year={2023} +} +``` + +## Contact Us + + +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) +- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) +- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) +- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature +- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu) + + +## Media Kit + +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) diff --git a/vllm_v0.10.0/RELEASE.md b/vllm_v0.10.0/RELEASE.md new file mode 100644 index 0000000..9352e7e --- /dev/null +++ b/vllm_v0.10.0/RELEASE.md @@ -0,0 +1,87 @@ +# Releasing vLLM + +vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes. + +## Release Versioning + +vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released. + +* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0. +* _minor_ major features +* _patch_ features and backwards-compatible bug fixes +* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release + +## Release Cadence + +Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release. +Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional. + +| Release Date | Patch release versions | Post Release versions | +| --- | --- | --- | +| Jan 2025 | 0.7.0 | --- | +| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- | +| Mar 2025 | 0.7.4, 0.7.5 | --- | +| Apr 2025 | 0.7.6, 0.7.7 | --- | +| May 2025 | 0.7.8, 0.7.9 | --- | +| Jun 2025 | 0.7.10, 0.7.11 | --- | +| Jul 2025 | 0.7.12, 0.7.13 | --- | +| Aug 2025 | 0.7.14, 0.7.15 | --- | +| Sep 2025 | 0.7.16, 0.7.17 | --- | +| Oct 2025 | 0.7.18, 0.7.19 | --- | +| Nov 2025 | 0.7.20, 0.7.21 | --- | +| Dec 2025 | 0.7.22, 0.7.23 | --- | + +## Release branch + +Each release is built from a dedicated release branch. + +* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live. +* For post releases, previously cut release branch is reused +* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release. +* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets. +* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch. 
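+As a rough illustration of the tag-driven flow described above, the sketch below shows how a release candidate and the final tag might be pushed from a previously cut release branch. Branch and version names are placeholders, not taken from an actual release.
+
+```bash
+# Hypothetical example: push an RC tag to trigger a release build,
+# then push the final (non-building) tag once the RC is validated.
+git checkout releases/vX.Y.Z      # previously cut release branch (placeholder name)
+git tag vX.Y.Z-rc1                # pushing an RC tag triggers the build
+git push origin vX.Y.Z-rc1
+
+# ...after the RC has been tested...
+git tag vX.Y.Z                    # final tag: used for release notes and assets only
+git push origin vX.Y.Z
+```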
+ +## Release Cherry-Pick Criteria + +After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base. + +* Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release) +* Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks +* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release) +* Documentation improvements +* Release branch specific changes (e.g. change version identifiers or CI fixes) + +Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes. + +## Manual validations + +### E2E Performance Validation + +Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. + +**Current Coverage:** +* Models: Llama3, Llama4, and Mixtral +* Hardware: NVIDIA H100 and AMD MI300x +* *Note: Coverage may change based on new model releases and hardware availability* + +**Performance Validation Process:** + +**Step 1: Get Access** +Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow. + +**Step 2: Review Benchmark Setup** +Familiarize yourself with the benchmark configurations: +* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) +* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) + +**Step 3: Run the Benchmark** +Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: +* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) +* **vLLM commit**: Set to the RC commit hash + +**Step 4: Review Results** +Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit. + +**Step 5: Performance Comparison** +Compare the current results against the previous release to verify no performance regressions have occurred. Here is an +example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms). 
diff --git a/vllm_v0.10.0/SECURITY.md b/vllm_v0.10.0/SECURITY.md new file mode 100644 index 0000000..6053cfb --- /dev/null +++ b/vllm_v0.10.0/SECURITY.md @@ -0,0 +1,13 @@ +# Security Policy + +## Reporting a Vulnerability + +If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. + +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). + +--- + +Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. + +Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. diff --git a/vllm_v0.10.0/benchmarks/README.md b/vllm_v0.10.0/benchmarks/README.md new file mode 100644 index 0000000..fb8690d --- /dev/null +++ b/vllm_v0.10.0/benchmarks/README.md @@ -0,0 +1,605 @@ +# Benchmarking vLLM + +This README guides you through running benchmark tests with the extensive +datasets supported on vLLM. It’s a living document, updated as new features and datasets +become available. + +**Dataset Overview** + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Dataset | Online | Offline | Data Path |
+|---------|--------|---------|-----------|
+| ShareGPT | | | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json |
+| BurstGPT | | | wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv |
+| Sonnet | | | Local file: benchmarks/sonnet.txt |
+| Random | | | synthetic |
+| HuggingFace-VisionArena | | | lmarena-ai/VisionArena-Chat |
+| HuggingFace-InstructCoder | | | likaixin/InstructCoder |
+| HuggingFace-AIMO | | | AI-MO/aimo-validation-aime, AI-MO/NuminaMath-1.5, AI-MO/NuminaMath-CoT |
+| HuggingFace-Other | | | lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered |
+| Custom | | | Local file: data.jsonl |
+
+✅: supported
+
+🟡: Partial support
+
+🚧: to be supported
+
+**Note**: For HuggingFace datasets, the `--dataset-name` argument should be set to `hf`.
+
+---
+
+## 🚀 Example - Online Benchmark
+
+ +First start serving your model + +```bash +vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +``` + +Then run the benchmarking script + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 +``` + +If successful, you will see the following output + +``` +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 +---------------Time to First Token---------------- +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 +---------------Inter-token Latency---------------- +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 +================================================== +``` + +**Custom Dataset** + +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +``` +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +``` + +```bash +# run benchmarking script +python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
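+If your prompts start out as a plain text file with one prompt per line, a small `jq` one-liner is one possible way to produce a `data.jsonl` in the expected format. Here `prompts.txt` is an assumed input file, not something shipped with the benchmarks.
+
+```bash
+# Sketch: wrap each line of prompts.txt into a {"prompt": "..."} object, one per line
+jq -R -c '{prompt: .}' prompts.txt > data.jsonl
+```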
+ +**VisionArena Benchmark for Vision Language Models** + +```bash +# need a model with vision capability here +vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +``` + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --hf-split train \ + --num-prompts 1000 +``` + +**InstructCoder Benchmark with Speculative Decoding** + +``` bash +VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +``` bash +python3 benchmarks/benchmark_serving.py \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name hf \ + --dataset-path likaixin/InstructCoder \ + --num-prompts 2048 +``` + +**Other HuggingFaceDataset Examples** + +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +``` + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --num-prompts 10 \ + --seed 42 +``` + +**`philschmid/mt-bench`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path philschmid/mt-bench \ + --num-prompts 80 +``` + +**Running With Sampling Parameters** + +When using OpenAI-compatible backends such as `vllm`, optional sampling +parameters can be specified. Example client command: + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --top-k 10 \ + --top-p 0.9 \ + --temperature 0.5 \ + --num-prompts 10 +``` + +**Running With Ramp-Up Request Rate** + +The benchmark tool also supports ramping up the request rate over the +duration of the benchmark run. This can be useful for stress testing the +server or finding the maximum throughput that it can handle, given some latency budget. + +Two ramp-up strategies are supported: +- `linear`: Increases the request rate linearly from a start value to an end value. +- `exponential`: Increases the request rate exponentially. + +The following arguments can be used to control the ramp-up: +- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). +- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. +- `--ramp-up-end-rps`: The request rate at the end of the benchmark. + +
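+For example, reusing the ShareGPT setup from the beginning of this section, a linear ramp-up from 1 to 20 requests per second could look like the sketch below; the ramp-up values and prompt count are illustrative, not recommendations.
+
+```bash
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
+  --num-prompts 100 \
+  --ramp-up-strategy linear \
+  --ramp-up-start-rps 1 \
+  --ramp-up-end-rps 20
+```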
+ +
+## 📈 Example - Offline Throughput Benchmark
+
+ +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset-name sonnet \ + --dataset-path vllm/benchmarks/sonnet.txt \ + --num-prompts 10 +``` + +If successful, you will see the following output + +``` +Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s +Total num prompt tokens: 5014 +Total num output tokens: 1500 +``` + +**VisionArena Benchmark for Vision Language Models** + +``` bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --num-prompts 1000 \ + --hf-split train +``` + +The `num prompt tokens` now includes image token counts + +``` +Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s +Total num prompt tokens: 14527 +Total num output tokens: 1280 +``` + +**InstructCoder Benchmark with Speculative Decoding** + +``` bash +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_USE_V1=1 \ +python3 vllm/benchmarks/benchmark_throughput.py \ + --dataset-name=hf \ + --dataset-path=likaixin/InstructCoder \ + --model=meta-llama/Meta-Llama-3-8B-Instruct \ + --input-len=1000 \ + --output-len=100 \ + --num-prompts=2048 \ + --async-engine \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +``` +Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s +Total num prompt tokens: 261136 +Total num output tokens: 204800 +``` + +**Other HuggingFaceDataset Examples** + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +```bash +python3 benchmarks/benchmark_throughput.py \ + --model Qwen/QwQ-32B \ + --backend vllm \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --hf-split train \ + --num-prompts 10 +``` + +**Benchmark with LoRA Adapters** + +``` bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +python3 vllm/benchmarks/benchmark_throughput.py \ + --model meta-llama/Llama-2-7b-hf \ + --backend vllm \ + --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --dataset_name sharegpt \ + --num-prompts 10 \ + --max-loras 2 \ + --max-lora-rank 8 \ + --enable-lora \ + --lora-path yard1/llama-2-7b-sql-lora-test + ``` + +
+ +
+## 🛠️ Example - Structured Output Benchmark
+
+ +Benchmark the performance of structured output generation (JSON, grammar, regex). + +**Server Setup** + +```bash +vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +``` + +**JSON Schema Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset json \ + --structured-output-ratio 1.0 \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**Grammar-based Generation Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset grammar \ + --structure-type grammar \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**Regex-based Generation Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset regex \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**Choice-based Generation Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset choice \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**XGrammar Benchmark Dataset** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset xgrammar_bench \ + --request-rate 10 \ + --num-prompts 1000 +``` + +
+ +
+## 📚 Example - Long Document QA Benchmark
+
+ +Benchmark the performance of long document question-answering with prefix caching. + +**Basic Long Document QA Test** + +```bash +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 16 \ + --document-length 2000 \ + --output-len 50 \ + --repeat-count 5 +``` + +**Different Repeat Modes** + +```bash +# Random mode (default) - shuffle prompts randomly +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode random + +# Tile mode - repeat entire prompt list in sequence +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode tile + +# Interleave mode - repeat each prompt consecutively +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode interleave +``` + +
+ +
+## 🗂️ Example - Prefix Caching Benchmark
+
+ +Benchmark the efficiency of automatic prefix caching. + +**Fixed Prompt with Prefix Caching** + +```bash +python3 benchmarks/benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-prompts 1 \ + --repeat-count 100 \ + --input-length-range 128:256 +``` + +**ShareGPT Dataset with Prefix Caching** + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +python3 benchmarks/benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \ + --enable-prefix-caching \ + --num-prompts 20 \ + --repeat-count 5 \ + --input-length-range 128:256 +``` + +
+ +
+## ⚡ Example - Request Prioritization Benchmark
+
+ +Benchmark the performance of request prioritization in vLLM. + +**Basic Prioritization Test** + +```bash +python3 benchmarks/benchmark_prioritization.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --input-len 128 \ + --output-len 64 \ + --num-prompts 100 \ + --scheduling-policy priority +``` + +**Multiple Sequences per Prompt** + +```bash +python3 benchmarks/benchmark_prioritization.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --input-len 128 \ + --output-len 64 \ + --num-prompts 100 \ + --scheduling-policy priority \ + --n 2 +``` + +
diff --git a/vllm_v0.10.0/benchmarks/auto_tune/README.md b/vllm_v0.10.0/benchmarks/auto_tune/README.md new file mode 100644 index 0000000..7732f50 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/auto_tune/README.md @@ -0,0 +1,137 @@ +# Automated vLLM Server Parameter Tuning + +This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. + +## Table of Contents +- [Prerequisites](#prerequisites) +- [Configuration](#configuration) +- [How to Run](#how-to-run) +- [Example Use Cases](#example-use-cases) +- [Output](#output) +- [How It Works](#how-it-works) + +## Prerequisites + +Before running the script, please ensure the following steps are completed: + +1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch. + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +# git checkout +``` + +1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions. + +2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible. + +## Configuration + +You must set the following variables at the top of the script before execution. + +| Variable | Description | Example Value | +| --- | --- | --- | +| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | +| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` | +| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` | +| `TP` | **Required.** The tensor-parallelism size. | `1` | +| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) | +| `INPUT_LEN` | **Required.** Request input length. | `4000` | +| `OUTPUT_LEN` | **Required.** Request output length. | `16` | +| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` | +| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` | +| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` | +| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` | + +**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`. + +## How to Run + +1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. +2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. + +``` +cd +bash auto_tune.sh +``` + + Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself. 
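+For example, one way to follow the recommendation above is to launch the script in a detachable `tmux` session from the folder that contains `auto_tune.sh`; the session name and log file name below are illustrative.
+
+```bash
+# Sketch: run the tuner inside tmux so it keeps running if the SSH connection drops
+tmux new-session -d -s autotune 'bash auto_tune.sh 2>&1 | tee auto_tune_run.log'
+
+# Re-attach later to check on progress
+tmux attach -t autotune
+```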
+ +## Example Use Cases + +Here are a few examples of how to configure the script for different goals: + +### 1. Maximize Throughput (No Latency Constraint) +- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number +``` + +#### 2. Maximize Throughput with a Latency Requirement +- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=500 +``` + +#### 3. Maximize Throughput with Prefix Caching and Latency Requirements +- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MIN_CACHE_HIT_PCT=60 +MAX_LATENCY_ALLOWED_MS=500 +``` + +## Output + +After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`. + +- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: + - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. + - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run. + +- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. + +``` +# Example result.txt content +hash:a1b2c3d4... +max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 +max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500 +... +best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile +``` + + If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict. + +- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run. + +## How It Works + +The script follows a systematic process to find the optimal parameters: + +1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing. + +2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists. + +3. **Latency-Aware Throughput Search**: For each parameter combination: + - The vLLM server is started. + - A benchmark is first run with an infinite request rate (`--request-rate inf`). + - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration. 
+ - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement. + +4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far. + +5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard. diff --git a/vllm_v0.10.0/benchmarks/auto_tune/auto_tune.sh b/vllm_v0.10.0/benchmarks/auto_tune/auto_tune.sh new file mode 100644 index 0000000..eaa28ea --- /dev/null +++ b/vllm_v0.10.0/benchmarks/auto_tune/auto_tune.sh @@ -0,0 +1,248 @@ +#!/bin/bash + +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# See details in README (benchmarks/auto_tune/README.md). + +TAG=$(date +"%Y_%m_%d_%H_%M") +BASE="" +MODEL="meta-llama/Llama-3.1-8B-Instruct" +SYSTEM="TPU" +TP=1 +DOWNLOAD_DIR="" +INPUT_LEN=4000 +OUTPUT_LEN=16 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=100000000000 +NUM_SEQS_LIST="128 256" +NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" + +LOG_FOLDER="$BASE/auto-benchmark/$TAG" +RESULT="$LOG_FOLDER/result.txt" +PROFILE_PATH="$LOG_FOLDER/profile" + +echo "result file: $RESULT" +echo "model: $MODEL" + +rm -rf $LOG_FOLDER +rm -rf $PROFILE_PATH +mkdir -p $LOG_FOLDER +mkdir -p $PROFILE_PATH + +cd "$BASE/vllm" + +pip install -q datasets + +current_hash=$(git rev-parse HEAD) +echo "hash:$current_hash" >> "$RESULT" +echo "current_hash: $current_hash" + +best_throughput=0 +best_max_num_seqs=0 +best_num_batched_tokens=0 +best_goodput=0 + +start_server() { + local gpu_memory_utilization=$1 + local max_num_seqs=$2 + local max_num_batched_tokens=$3 + local vllm_log=$4 + local profile_dir=$5 + + pkill -f vllm + + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ + --disable-log-requests \ + --port 8004 \ + --gpu-memory-utilization $gpu_memory_utilization \ + --max-num-seqs $max_num_seqs \ + --max-num-batched-tokens $max_num_batched_tokens \ + --tensor-parallel-size $TP \ + --enable-prefix-caching \ + --load-format dummy \ + --download-dir "$DOWNLOAD_DIR" \ + --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & + + # wait for 10 minutes... + server_started=0 + for i in {1..60}; do + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + if [[ "$STATUS_CODE" -eq 200 ]]; then + server_started=1 + break + else + sleep 10 + fi + done + if (( ! server_started )); then + echo "server did not start within 10 minutes. Please check server log at $vllm_log". 
+ return 1 + else + return 0 + fi +} + +update_best_profile() { + local profile_dir=$1 + local profile_index=$2 + sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)) + selected_profile_file= + if [[ "$SYSTEM" == "TPU" ]]; then + selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" + fi + if [[ "$SYSTEM" == "GPU" ]]; then + selected_profile_file="${sorted_paths[$profile_index]}" + fi + rm -f $PROFILE_PATH/* + cp $selected_profile_file $PROFILE_PATH +} + +run_benchmark() { + local max_num_seqs=$1 + local max_num_batched_tokens=$2 + local gpu_memory_utilization=$3 + echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" + local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}" + echo "vllm_log: $vllm_log" + echo + rm -f $vllm_log + mkdir -p $profile_dir + pkill -f vllm + local profile_index=0 + + echo "starting server..." + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir + result=$? + if [[ "$result" -eq 1 ]]; then + echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + else + echo "server started." + fi + echo + + echo "run benchmark test..." + meet_latency_requirement=0 + # get a basic qps by using request-rate inf + bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" + prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) +adjusted_input_len=$(( INPUT_LEN - prefix_len )) + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $adjusted_input_len \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate inf \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 1000 \ + --random-prefix-len $prefix_len \ + --port 8004 \ + --profile &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') + goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + + if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then + meet_latency_requirement=1 + request_rate=inf + fi + + if (( ! 
meet_latency_requirement )); then + # start from request-rate as int(throughput) + 1 + request_rate=$((${throughput%.*} + 1)) + while ((request_rate > 0)); do + profile_index=$((profile_index+1)) + # clear prefix cache + curl -X POST http://0.0.0.0:8004/reset_prefix_cache + sleep 5 + bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $adjusted_input_len \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate $request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') + goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then + meet_latency_requirement=1 + break + fi + request_rate=$((request_rate-1)) + done + fi + # write the results and update the best result. + if ((meet_latency_requirement)); then + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" + if (( $(echo "$throughput > $best_throughput" | bc -l) )); then + best_throughput=$throughput + best_max_num_seqs=$max_num_seqs + best_num_batched_tokens=$max_num_batched_tokens + best_goodput=$goodput + if [[ "$SYSTEM" == "TPU" ]]; then + update_best_profile "$profile_dir/plugins/profile" $profile_index + fi + if [[ "$SYSTEM" == "GPU" ]]; then + update_best_profile "$profile_dir" $profile_index + fi + fi + else + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT" + fi + + echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" + + pkill vllm + sleep 10 + printf '=%.0s' $(seq 1 20) + return 0 +} + +read -r -a num_seqs_list <<< "$NUM_SEQS_LIST" +read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" + +# first find out the max gpu-memory-utilization without HBM OOM. +gpu_memory_utilization=0.98 +find_gpu_memory_utilization=0 +while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + result=$? + if [[ "$result" -eq 0 ]]; then + find_gpu_memory_utilization=1 + break + else + gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc) + fi +done + +if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then + echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model." +else + echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER." 
+ exit 1 +fi + +for num_seqs in "${num_seqs_list[@]}"; do + for num_batched_tokens in "${num_batched_tokens_list[@]}"; do + run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization + done +done +echo "finish permutations" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" + diff --git a/vllm_v0.10.0/benchmarks/backend_request_func.py b/vllm_v0.10.0/benchmarks/backend_request_func.py new file mode 100644 index 0000000..c7229db --- /dev/null +++ b/vllm_v0.10.0/benchmarks/backend_request_func.py @@ -0,0 +1,628 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import io +import json +import os +import sys +import time +import traceback +from dataclasses import dataclass, field +from typing import Optional, Union + +import aiohttp +import huggingface_hub.constants +from tqdm.asyncio import tqdm +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +@dataclass +class RequestFuncInput: + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + model_name: Optional[str] = None + logprobs: Optional[int] = None + extra_body: Optional[dict] = None + multi_modal_content: Optional[dict] = None + ignore_eos: bool = False + language: Optional[str] = None + + +@dataclass +class RequestFuncOutput: + generated_text: str = "" + success: bool = False + latency: float = 0.0 + output_tokens: int = 0 + ttft: float = 0.0 # Time to first token + itl: list[float] = field(default_factory=list) # list of inter-token latencies + tpot: float = 0.0 # avg next-token latencies + prompt_len: int = 0 + error: str = "" + + +async def async_request_tgi( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + params = { + "max_new_tokens": request_func_input.output_len, + "do_sample": True, + "temperature": 0.01, # TGI does not accept 0.0 temperature. + "top_p": 0.99, # TGI does not accept 1.0 top_p. + "truncate": request_func_input.prompt_len, + "ignore_eos_token": request_func_input.ignore_eos, + } + payload = { + "inputs": request_func_input.prompt, + "parameters": params, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + if request_func_input.ignore_eos: + output.output_tokens = request_func_input.output_len + else: + output.output_tokens = None + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + + # NOTE: Sometimes TGI returns a ping response without + # any data, we should skip it. 
+ if chunk_bytes.startswith(":"): + continue + chunk = chunk_bytes.removeprefix("data:") + + data = json.loads(chunk) + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + output.generated_text = data["generated_text"] + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_trt_llm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + payload = { + "accumulate_tokens": True, + "text_input": request_func_input.prompt, + "temperature": 0.0, + "top_p": 1.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + if request_func_input.ignore_eos: + payload["min_length"] = request_func_input.output_len + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data:") + + data = json.loads(chunk) + output.generated_text += data["text_output"] + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_deepspeed_mii( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "max_tokens": request_func_input.output_len, + "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. + "top_p": 1.0, + } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. 
+ # See https://github.com/microsoft/DeepSpeed-MII/pull/311 + output.ttft = 0 + + st = time.perf_counter() + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + parsed_resp = await response.json() + output.latency = time.perf_counter() - st + if "choices" in parsed_resp: + output.generated_text = parsed_resp["choices"][0]["text"] + elif "text" in parsed_resp: + output.generated_text = parsed_resp["text"][0] + else: + output.error = ( + "Unexpected response format: " + "neither 'choices' nor 'text' found" + ) + output.success = False + output.success = True + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "repetition_penalty": 1.0, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + first_chunk_received = False + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk != "[DONE]": + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + if usage := data.get("usage"): + output.output_tokens = usage.get("completion_tokens") + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!" 
+ ) + output.generated_text = generated_text + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("chat/completions", "profile")), ( + "OpenAI Chat Completions API URL must end with 'chat/completions'." + ) + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + if request_func_input.multi_modal_content: + content.append(request_func_input.multi_modal_content) + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "messages": [ + {"role": "user", "content": content}, + ], + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + # NOTE: SSE comments (often used as pings) start with a colon. + # These are not JSON data payload and should be skipped. + if chunk_bytes.startswith(":"): + continue + + chunk = chunk_bytes.removeprefix("data: ") + + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get("completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + + api_url = request_func_input.api_url + assert api_url.endswith(("transcriptions", "translations")), ( + "OpenAI Chat Completions API URL must end with 'transcriptions' " + ) + "or `translations`." 
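+ # Illustrative only (chunk shapes assumed, not captured from a running server):
+ # the streamed transcription response parsed below is expected to resemble
+ #   data: {"choices": [{"delta": {"content": " Hello"}}], ...}
+ #   data: {"choices": [], "usage": {"completion_tokens": 42}}
+ #   data: [DONE]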
+ + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "language": "en", + # Flattened due to multipart/form-data + "stream_include_usage": True, + "stream_continuous_usage_stats": True, + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + form = aiohttp.FormData() + form.add_field("file", f, content_type="audio/wav") + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, data=form, headers=headers + ) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp + ) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens" + ) + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +def get_model(pretrained_model_name_or_path: str) -> str: + if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": + from modelscope import snapshot_download + + from vllm.model_executor.model_loader.weight_utils import get_lock + + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. 
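+ # Illustrative: this ModelScope branch only runs when the benchmark is
+ # launched with the environment variable set, e.g. (command assumed):
+ #   VLLM_USE_MODELSCOPE=true python3 benchmark_serving.py --backend vllm ...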
+ with get_lock(pretrained_model_name_or_path): + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"], + ) + + return model_path + return pretrained_model_name_or_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + **kwargs, +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path + ): + pretrained_model_name_or_path = get_model(pretrained_model_name_or_path) + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + if tokenizer_mode == "mistral": + try: + from vllm.transformers_utils.tokenizer import MistralTokenizer + except ImportError as e: + raise ImportError( + "MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode." + ) from e + return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) + else: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + +ASYNC_REQUEST_FUNCS = { + "tgi": async_request_tgi, + "vllm": async_request_openai_completions, + "lmdeploy": async_request_openai_completions, + "deepspeed-mii": async_request_deepspeed_mii, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, + "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, + "sglang": async_request_openai_completions, + "llama.cpp": async_request_openai_completions, +} + +OPENAI_COMPATIBLE_BACKENDS = [ + k + for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, async_request_openai_chat_completions) +] diff --git a/vllm_v0.10.0/benchmarks/benchmark_dataset.py b/vllm_v0.10.0/benchmarks/benchmark_dataset.py new file mode 100644 index 0000000..1ad6cef --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_dataset.py @@ -0,0 +1,1173 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. 
Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena +""" + +import base64 +import io +import json +import logging +import random +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass +from functools import cache +from io import BytesIO +from typing import Any, Callable, Optional, Union + +import numpy as np +import pandas as pd +from datasets import load_dataset +from PIL import Image +from transformers import PreTrainedTokenizerBase + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer + +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. + """ + + prompt: Union[str, Any] + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + lora_request: Optional[LoRARequest] = None + + +# ----------------------------------------------------------------------------- +# Benchmark Dataset Base Class +# ----------------------------------------------------------------------------- + + +class BenchmarkDataset(ABC): + DEFAULT_SEED = 0 + IS_MULTIMODAL = False + + def __init__( + self, + dataset_path: Optional[str] = None, + random_seed: int = DEFAULT_SEED, + ) -> None: + """ + Initialize the BenchmarkDataset with an optional dataset path and random + seed. Args: + dataset_path (Optional[str]): Path to the dataset. If None, it + indicates that a default or random dataset might be used. + random_seed (int): Seed value for reproducible shuffling or + sampling. Defaults to DEFAULT_SEED. + """ + self.dataset_path = dataset_path + # Set the random seed, ensuring that a None value is replaced with the + # default seed. + self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED + self.data = None + + def apply_multimodal_chat_transformation( + self, prompt: str, mm_content: Optional[MultiModalDataDict] = None + ) -> list[dict]: + """ + Transform a prompt and optional multimodal content into a chat format. + This method is used for chat models that expect a specific conversation + format. + """ + content = [{"text": prompt, "type": "text"}] + if mm_content is not None: + content.append(mm_content) + return [{"role": "user", "content": content}] + + def load_data(self) -> None: + """ + Load data from the dataset path into self.data. + + This method must be overridden by subclasses since the method to load + data will vary depending on the dataset format and source. + + Raises: + NotImplementedError: If a subclass does not implement this method. + """ + # TODO (jenniferzhao): add support for downloading data + raise NotImplementedError("load_data must be implemented in subclasses.") + + def get_random_lora_request( + self, + tokenizer: PreTrainedTokenizerBase, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + ) -> tuple[Optional[LoRARequest], AnyTokenizer]: + """ + Optionally select a random LoRA request and return its associated + tokenizer. + + This method is used when LoRA parameters are provided. 
It randomly + selects a LoRA based on max_loras and retrieves a cached tokenizer for + that LoRA if available. Otherwise, it returns the base tokenizer. + + Args: + tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no + LoRA is selected. max_loras (Optional[int]): The maximum number of + LoRAs available. If None, LoRA is not used. lora_path + (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA + is not used. + + Returns: + tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first + element is a LoRARequest (or None if not applicable) and the second + element is the tokenizer associated with the LoRA request (or the + base tokenizer). + """ + if max_loras is None or lora_path is None: + return None, tokenizer + + # Generate a random LoRA ID in the range [1, max_loras]. + lora_id = random.randint(1, max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + # Return lora_request and the cached tokenizer if available; otherwise, + # return the base tokenizer + return lora_request, lora_tokenizer_cache[lora_id] or tokenizer + + @abstractmethod + def sample( + self, tokenizer: PreTrainedTokenizerBase, num_requests: int + ) -> list[SampleRequest]: + """ + Abstract method to generate sample requests from the dataset. + + Subclasses must override this method to implement dataset-specific logic + for generating a list of SampleRequest objects. + + Args: + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + for processing the dataset's text. + num_requests (int): The number of sample requests to generate. + + Returns: + list[SampleRequest]: A list of sample requests generated from the + dataset. + """ + raise NotImplementedError("sample must be implemented in subclasses.") + + def maybe_oversample_requests( + self, requests: list[SampleRequest], num_requests: int + ) -> None: + """ + Oversamples the list of requests if its size is less than the desired + number. + + Args: + requests (List[SampleRequest]): The current list of sampled + requests. num_requests (int): The target number of requests. + """ + if len(requests) < num_requests: + random.seed(self.random_seed) + additional = random.choices(requests, k=num_requests - len(requests)) + requests.extend(additional) + logger.info("Oversampled requests to reach %d total samples.", num_requests) + + +# ----------------------------------------------------------------------------- +# Utility Functions and Global Caches +# ----------------------------------------------------------------------------- + + +def is_valid_sequence( + prompt_len: int, + output_len: int, + min_len: int = 4, + max_prompt_len: int = 1024, + max_total_len: int = 2048, + skip_min_output_len_check: bool = False, +) -> bool: + """ + Validate a sequence based on prompt and output lengths. + + Default pruning criteria are copied from the original `sample_hf_requests` + and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as + from `sample_requests` in benchmark_throughput.py. 
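+
+ Worked example (illustrative, using the default limits above): a pair with
+ prompt_len=10 and output_len=100 is kept; prompt_len=2000 is rejected for
+ exceeding max_prompt_len=1024; and prompt_len=1000 with output_len=1500 is
+ rejected because the combined length exceeds max_total_len=2048.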
+ """ + # Check for invalid conditions + prompt_too_short = prompt_len < min_len + output_too_short = (not skip_min_output_len_check) and (output_len < min_len) + prompt_too_long = prompt_len > max_prompt_len + combined_too_long = (prompt_len + output_len) > max_total_len + + # Return True if none of the invalid conditions are met + return not ( + prompt_too_short or output_too_short or prompt_too_long or combined_too_long + ) + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +# Global cache for LoRA tokenizers. +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def process_image(image: Any) -> Mapping[str, Any]: + """ + Process a single image input and return a multimedia content dictionary. + + Supports three input types: + + 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key + containing raw image data. - Loads the bytes as a PIL.Image.Image. + + 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as + a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns + a dictionary with the image as a base64 data URL. + + 3. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(image, dict) and "bytes" in image: + image = Image.open(BytesIO(image["bytes"])) + if isinstance(image, Image.Image): + image = convert_image_mode(image, "RGB") + with io.BytesIO() as image_data: + image.save(image_data, format="JPEG") + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, + } + + if isinstance(image, str): + image_url = ( + image if image.startswith(("http://", "file://")) else f"file://{image}" + ) + return {"type": "image_url", "image_url": {"url": image_url}} + + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image" + " or str or dictionary with raw image bytes." + ) + + +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the random dataset. 
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 0.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs, + ) -> list[SampleRequest]: + # Enforce range_ratio < 1 + assert range_ratio < 1.0, ( + "random_range_ratio must be < 1.0 to ensure a valid sampling range" + ) + + vocab_size = tokenizer.vocab_size + num_special_tokens = tokenizer.num_special_tokens_to_add() + real_input_len = input_len - num_special_tokens + + prefix_token_ids = ( + np.random.randint(0, vocab_size, size=prefix_len).tolist() + if prefix_len > 0 + else [] + ) + + # New sampling logic: [X * (1 - b), X * (1 + b)] + input_low = int(real_input_len * (1 - range_ratio)) + input_high = int(real_input_len * (1 + range_ratio)) + output_low = int(output_len * (1 - range_ratio)) + # Ensure the lower bound for output length is at least 1 to prevent + # sampling 0 tokens, which can cause request failures. + output_low = max(output_low, 1) + output_high = int(output_len * (1 + range_ratio)) + + # Add logging for debugging + logger.info("Sampling input_len from [%s, %s]", input_low, input_high) + logger.info("Sampling output_len from [%s, %s]", output_low, output_high) + + input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) + output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) + offsets = np.random.randint(0, vocab_size, size=num_requests) + + requests = [] + for i in range(num_requests): + inner_seq = ( + (offsets[i] + i + np.arange(input_lens[i])) % vocab_size + ).tolist() + token_sequence = prefix_token_ids + inner_seq + prompt = tokenizer.decode(token_sequence) + # After decoding the prompt we have to encode and decode it again. + # This is done because in some cases N consecutive tokens + # give a string tokenized into != N number of tokens. + # For example for GPT2Tokenizer: + # [6880, 6881] -> ['Ġcalls', 'here'] -> + # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + # To avoid uncontrolled change of the prompt length, + # the encoded sequence is truncated before being decode again. + total_input_len = prefix_len + int(input_lens[i]) + re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ + :total_input_len + ] + prompt = tokenizer.decode(re_encoded_sequence) + total_input_len = len(re_encoded_sequence) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + ) + ) + return requests + + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. 
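+ # Illustrative entry shape (field names assumed from the indexing below):
+ #   {"conversations": [{"from": "human", "value": "Hi, who are you?"},
+ #                      {"from": "gpt", "value": "I am an assistant."}]}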
+ self.data = [ + entry + for entry in self.data + if "conversations" in entry and len(entry["conversations"]) >= 2 + ] + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + samples: list = [] + for entry in self.data: + if len(samples) >= num_requests: + break + prompt, completion = ( + entry["conversations"][0]["value"], + entry["conversations"][1]["value"], + ) + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path + ) + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + new_output_len = len(completion_ids) if output_len is None else output_len + if not is_valid_sequence( + prompt_len, + new_output_len, + skip_min_output_len_check=output_len is not None, + ): + continue + if enable_multimodal_chat: + prompt = self.apply_multimodal_chat_transformation(prompt, None) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=new_output_len, + lora_request=lora_request, + ) + ) + self.maybe_oversample_requests(samples, num_requests) + return samples + + +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. + # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset." 
+ ) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Sonnet Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SonnetDataset(BenchmarkDataset): + """ + Simplified implementation of the Sonnet dataset. Loads poem lines from a + text file and generates sample requests. Default values here copied from + `benchmark_serving.py` for the sonnet dataset. + """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample( + self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + **kwargs, + ) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template( + base_msg, add_generation_prompt=True, tokenize=False + ) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset})." + ) + + # Determine how many poem lines to use. 
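+ # Worked example with illustrative numbers: if avg_len is ~10 tokens per
+ # line and base_offset is 30, then input_len=550 gives
+ # num_input_lines = round((550 - 30) / 10) = 52, while prefix_len=200 gives
+ # num_prefix_lines = round((200 - 30) / 10) = 17 shared prefix lines.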
+ num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + while len(samples) < num_requests: + extra_lines = random.choices( + self.data, k=num_input_lines - num_prefix_lines + ) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False + ) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data( + self, + ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. + self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + **kwargs, + ) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path + ) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. 
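+ # e.g. (illustrative) request i=2 with input_len=5 and a large vocab_size
+ # yields token_ids = [2, 3, 4, 5, 6] before being decoded back into text.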
+ token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + ) + ) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Base Implementation +# ----------------------------------------------------------------------------- +class HuggingFaceDataset(BenchmarkDataset): + """Base class for datasets hosted on HuggingFace.""" + + SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() + + def __init__( + self, + dataset_path: str, + dataset_split: str, + no_stream: bool = False, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(dataset_path=dataset_path, **kwargs) + + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + self.load_stream = not no_stream + self.load_data() + + def load_data(self) -> None: + """Load data from HuggingFace datasets.""" + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=self.load_stream, + ) + self.data = self.data.shuffle(seed=self.random_seed) + + +# ----------------------------------------------------------------------------- +# Conversation Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ConversationDataset(HuggingFaceDataset): + """Dataset for conversation data with multimodal support.""" + + SUPPORTED_DATASET_PATHS = { + "lmms-lab/LLaVA-OneVision-Data", + "Aeala/ShareGPT_Vicuna_unfiltered", + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + # Filter examples with at least 2 conversations + filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) + sampled_requests = [] + dynamic_output = output_len is None + + for item in filtered_data: + if len(sampled_requests) >= num_requests: + break + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, completion_len): + continue + mm_content = process_image(item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(HuggingFaceDataset): + """ + Vision Arena Dataset. 
+ """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = { + "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) + if parser_fn is None: + raise ValueError(f"Unsupported dataset path: {self.dataset_path}") + prompt = parser_fn(item) + mm_content = process_image(item["images"][0]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. + https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. It consists + of 114,239 instruction-input-output triplets, and covers multiple distinct + code editing scenario. + """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + SUPPORTED_DATASET_PATHS = { + "likaixin/InstructCoder", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. + https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. 
+ This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0] + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# AIMO Dataset Implementation +# ----------------------------------------------------------------------------- + + +class AIMODataset(HuggingFaceDataset): + """ + Dataset class for processing a AIMO dataset with reasoning questions. + """ + + SUPPORTED_DATASET_PATHS = { + "AI-MO/aimo-validation-aime", + "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt, completion = item["problem"], item["solution"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 + ): + continue + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Next Edit Prediction Dataset Implementation +# ----------------------------------------------------------------------------- + + +zeta_prompt = """### Instruction: +You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. + +### User Edits: + +{} + +### User Excerpt: + +{} + +### Response: + +""" # noqa: E501 + + +def _format_zeta_prompt( + sample: dict, original_start_marker: str = "<|editable_region_start|>" +) -> dict: + """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. + + This function formats examples from the NEP dataset + into prompts and expected outputs. It could be + further extended to support more NEP datasets. 
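+ For example (illustrative): if a sample's output is
+ "header\n<|editable_region_start|>\nfoo()", the expected output keeps only
+ the text from the marker onward, i.e. "<|editable_region_start|>\nfoo()".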
+ + Args: + sample: The dataset sample containing events, + inputs, and outputs. + original_start_marker: The marker indicating the + start of the editable region. Defaults to + "<|editable_region_start|>". + + Returns: + A dictionary with the formatted prompts and expected outputs. + """ + events = sample["events"] + input = sample["input"] + output = sample["output"] + prompt = zeta_prompt.format(events, input) + + # following the original implementation, extract the focused region + # from the raw output + output_start_index = output.find(original_start_marker) + output_focused_region = output[output_start_index:] + expected_output = output_focused_region + + return {"prompt": prompt, "expected_output": expected_output} + + +class NextEditPredictionDataset(HuggingFaceDataset): + """ + Dataset class for processing a Next Edit Prediction dataset. + """ + + SUPPORTED_DATASET_PATHS = { + "zed-industries/zeta", + } + MAPPING_PROMPT_FUNCS = { + "zed-industries/zeta": _format_zeta_prompt, + } + + def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs): + formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) + if formatting_prompt_func is None: + raise ValueError(f"Unsupported dataset path: {self.dataset_path}") + samples = [] + for sample in self.data: + sample = formatting_prompt_func(sample) + samples.append( + SampleRequest( + prompt=sample["prompt"], + prompt_len=len(tokenizer(sample["prompt"]).input_ids), + expected_output_len=len( + tokenizer(sample["expected_output"]).input_ids + ), + ) + ) + if len(samples) >= num_requests: + break + self.maybe_oversample_requests(samples, num_requests) + return samples + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", + "facebook/voxpopuli", + "LIUM/tedlium", + "edinburghcstr/ami", + "speechcolab/gigaspeech", + "kensho/spgispeech", + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. 
+ TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + import librosa + + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + ) + ) + if skipped: + logger.warning( + "%d samples discarded from dataset due to" + " their length being greater than" + " what Whisper supports.", + skipped, + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git a/vllm_v0.10.0/benchmarks/benchmark_latency.py b/vllm_v0.10.0/benchmarks/benchmark_latency.py new file mode 100644 index 0000000..4d2ea12 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_latency.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark the latency of processing a single batch of requests.""" + +import argparse +import dataclasses +import json +import os +import time +from typing import Any, Optional + +import numpy as np +from tqdm import tqdm + +import vllm.envs as envs +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams +from vllm.utils import FlexibleArgumentParser + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, + ) + if pt_records: + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def main(args: argparse.Namespace): + print(args) + + engine_args = EngineArgs.from_cli_args(args) + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. + llm = LLM(**dataclasses.asdict(engine_args)) + assert llm.llm_engine.model_config.max_model_len >= ( + args.input_len + args.output_len + ), ( + "Please ensure that max_model_len is greater than" + " the sum of input_len and output_len." 
+ ) + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] + + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + llm.start_profile() + llm_generate() + llm.stop_profile() + else: + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = envs.VLLM_TORCH_PROFILER_DIR + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) + + +def create_argument_parser(): + parser = FlexibleArgumentParser( + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) + parser.add_argument( + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. 
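+ # Illustrative invocation (the --model flag comes from EngineArgs and the
+ # model name is an assumption; the other flags are defined above):
+ #   python3 benchmark_latency.py --model meta-llama/Llama-2-7b-chat-hf \
+ #       --input-len 32 --output-len 128 --batch-size 8 --num-iters 30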
+ parser.set_defaults(enable_prefix_caching=False) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) + main(args) diff --git a/vllm_v0.10.0/benchmarks/benchmark_long_document_qa_throughput.py b/vllm_v0.10.0/benchmarks/benchmark_long_document_qa_throughput.py new file mode 100644 index 0000000..6e0f3b5 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_long_document_qa_throughput.py @@ -0,0 +1,202 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Offline benchmark to test the long document QA throughput. + +Example usage: + # This workload samples 8 different prompts with a default input + # length of 20000 tokens, then replicates each prompt 2 times + # in random order. + python benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --repeat-count 2 + +Commandline arguments: + --num-documents: The number of documents to sample prompts from. + + --document-length: The length of each document in tokens. + (Optional, default: 20000) + + --output-len: The number of tokens to generate for each prompt. + (Optional, default: 10) + + --repeat-count: The number of times to repeat each prompt. + (Optional, default: 2) + + --repeat-mode: The mode to repeat prompts. The supported modes are: + - 'random': shuffle the prompts randomly. (Default) + - 'tile': the entire prompt list is repeated in sequence. (Potentially + lowest cache hit) + - 'interleave': each prompt is repeated consecutively before + moving to the next element. (Highest cache hit) + + --shuffle-seed: Random seed when the repeat mode is "random". + (Optional, default: 0) + +In the meantime, it also supports all the vLLM engine args to initialize the +LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more +details. +""" + +import dataclasses +import random +import time + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def test_long_document_qa(llm=None, sampling_params=None, prompts=None): + """ + Test long document QA with the given prompts and sampling parameters. + Print the time spent in processing all the prompts. + + Args: + llm: The language model used for generating responses. + sampling_params: Sampling parameter used to generate the response. + prompts: A list of prompt strings to be processed by the LLM. + """ + start_time = time.time() + llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + print(f"Time to execute all requests: {end_time - start_time:.4f} secs") + + +def repeat_prompts(prompts, repeat_count, mode: str): + """ + Repeat each prompt in the list for a specified number of times. + The order of prompts in the output list depends on the mode. + + Args: + prompts: A list of prompts to be repeated. + repeat_count: The number of times each prompt is repeated. + mode: The mode of repetition. Supported modes are: + - 'random': Shuffle the prompts randomly after repetition. + - 'tile': Repeat the entire prompt list in sequence. + Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. + - 'interleave': Repeat each prompt consecutively before moving to + the next. 
Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
+
+    Returns:
+        A list of repeated prompts in the specified order.
+
+    Raises:
+        ValueError: If an invalid mode is provided.
+    """
+    print("Repeat mode: ", mode)
+    if mode == "random":
+        repeated_prompts = prompts * repeat_count
+        random.shuffle(repeated_prompts)
+        return repeated_prompts
+    elif mode == "tile":
+        return prompts * repeat_count
+    elif mode == "interleave":
+        repeated_prompts = []
+        for prompt in prompts:
+            repeated_prompts.extend([prompt] * repeat_count)
+        return repeated_prompts
+    else:
+        raise ValueError(
+            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
+        )
+
+
+def main(args):
+    random.seed(args.shuffle_seed)
+
+    # Prepare the prompts:
+    # we append the document id at the beginning to avoid any of the documents
+    # being a prefix of another document
+    prompts = [
+        str(i) + " ".join(["hi"] * args.document_length)
+        for i in range(args.num_documents)
+    ]
+
+    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
+
+    warmup_prompts = [
+        "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
+        for i in range(args.num_documents)
+    ]
+
+    # Create the LLM engine
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**dataclasses.asdict(engine_args))
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_long_document_qa(
+        llm=llm,
+        prompts=warmup_prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_long_document_qa(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+def create_argument_parser():
+    parser = FlexibleArgumentParser(
+        description="Benchmark the long document QA throughput with or "
+        "without automatic prefix caching."
+    )
+
+    parser.add_argument(
+        "--document-length",
+        type=int,
+        # Roughly the number of tokens for a system paper,
+        # excluding images
+        default=20000,
+        help="The length of each document in tokens.",
+    )
+
+    parser.add_argument(
+        "--num-documents",
+        type=int,
+        default=8,
+        help="The number of documents to sample prompts from.",
+    )
+
+    parser.add_argument("--output-len", type=int, default=10)
+
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=2,
+        help="Number of times to repeat each prompt",
+    )
+
+    parser.add_argument(
+        "--repeat-mode",
+        type=str,
+        default="random",
+        help="The mode to repeat prompts. The supported "
+        'modes are "random", "tile", and "interleave". '
+        "See repeat_prompts() in the source code for details.",
+    )
+
+    parser.add_argument(
+        "--shuffle-seed",
+        type=int,
+        default=0,
+        help='Random seed when the repeat mode is "random"',
+    )
+
+    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    main(args)
diff --git a/vllm_v0.10.0/benchmarks/benchmark_prefix_caching.py b/vllm_v0.10.0/benchmarks/benchmark_prefix_caching.py
new file mode 100644
index 0000000..b5e2613
--- /dev/null
+++ b/vllm_v0.10.0/benchmarks/benchmark_prefix_caching.py
@@ -0,0 +1,278 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark the efficiency of prefix caching.
+
+This script allows you to benchmark the performance of
+a model with and without prefix caching using either fixed prompts
+or prompts sampled from the ShareGPT dataset.
+ +Fixed example usage: + python benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-prompts 1 \ + --repeat-count 100 \ + --input-length-range 128:256 + +ShareGPT example usage: + # This command samples 20 prompts with input lengths + # between 128 and 256 tokens from the ShareGPT dataset, + # then replicates each prompt 5 times. + python benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \ + --enable-prefix-caching \ + --num-prompts 20 \ + --repeat-count 5 \ + --input-length-range 128:256 +""" + +import dataclasses +import json +import random +import time +from typing import Optional + +from transformers import PreTrainedTokenizerBase + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 + + +def test_prefix(llm=None, sampling_params=None, prompts=None): + start_time = time.time() + + llm.generate(prompts, sampling_params=sampling_params) + + end_time = time.time() + print(f"cost time {end_time - start_time}") + + +@dataclasses.dataclass +class Request: + prompt: str + prompt_len: int + output_len: int + + +def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: + vocab = tokenizer.get_vocab() + all_special_ids = set(tokenizer.all_special_ids) + + # Remove the special tokens. 
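+    # Note: random.choices samples with replacement, so the returned
+    # token-id list may contain duplicates.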
+    return random.choices(
+        [v for k, v in vocab.items() if k not in all_special_ids],
+        k=length,
+    )
+
+
+def sample_requests_from_dataset(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: tuple[int, int],
+    fixed_output_len: Optional[int],
+) -> list[Request]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    min_len, max_len = input_length_range
+    assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
+
+    # Filter out sequences that are too long or too short
+    filtered_requests: list[Request] = []
+
+    for i in range(len(dataset)):
+        if len(filtered_requests) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+        prompt = tokenizer.decode(prompt_token_ids)
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
+        if min_len <= prompt_len <= max_len:
+            filtered_requests.append(Request(prompt, prompt_len, output_len))
+
+    return filtered_requests
+
+
+def sample_requests_from_random(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: tuple[int, int],
+    fixed_output_len: Optional[int],
+    prefix_len: int,
+) -> list[Request]:
+    requests = []
+    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+    min_len, max_len = input_length_range
+
+    for i in range(num_requests):
+        unique_part_token_ids = sample_tokens(
+            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
+        )
+        prompt_token_ids = prefix_token_ids + unique_part_token_ids
+        prompt = tokenizer.decode(prompt_token_ids)
+        prompt_len = len(prompt_token_ids)
+        assert min_len <= prompt_len <= max_len, (
+            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        )
+        requests.append(Request(prompt, prompt_len, fixed_output_len))
+    return requests
+
+
+def repeat_and_sort_requests(
+    requests: list[Request], repeat_count: int, sort: bool = False
+) -> list[str]:
+    repeated_requests = requests * repeat_count
+    if sort:
+        repeated_requests.sort(key=lambda x: x.prompt_len)
+    else:
+        random.shuffle(repeated_requests)
+    return [req.prompt for req in repeated_requests]
+
+
+def main(args):
+    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
+    input_length_range = tuple(map(int, args.input_length_range.split(":")))
+    random.seed(args.seed)
+    if args.dataset_path is not None:
+        if args.prefix_len > 0:
+            raise ValueError(
+                "prefix-len is not supported when dataset-path is provided."
+ ) + print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}") + filtered_requests = sample_requests_from_dataset( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + input_length_range=input_length_range, + fixed_output_len=args.output_len, + ) + else: + print(f"Start to sample {args.num_prompts} prompts from random") + filtered_requests = sample_requests_from_random( + num_requests=args.num_prompts, + tokenizer=tokenizer, + input_length_range=input_length_range, + fixed_output_len=args.output_len, + prefix_len=args.prefix_len, + ) + + # Print some helpful stats of the requests. + print(f"Sampled {len(filtered_requests)} requests.") + prompt_lens = [req.prompt_len for req in filtered_requests] + print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") + print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") + print(f"Min Prompt Length: {min(prompt_lens)}") + print(f"Max Prompt Length: {max(prompt_lens)}") + + engine_args = EngineArgs.from_cli_args(args) + + llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + temperature=0, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) + + print("Testing filtered requests") + prompts = repeat_and_sort_requests( + filtered_requests, repeat_count=args.repeat_count, sort=args.sort + ) + + print("------start generating------") + test_prefix( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +def create_argument_parser(): + parser = FlexibleArgumentParser( + description="Benchmark the performance with or without " + "automatic prefix caching." + ) + parser.add_argument( + "--dataset-path", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument("--output-len", type=int, default=10) + parser.add_argument( + "--num-prompts", + type=int, + required=True, + help="Number of the prompts sampled from dataset", + ) + parser.add_argument( + "--repeat-count", + type=int, + default=1, + help="Number of times to repeat each prompt", + ) + parser.add_argument( + "--sort", action="store_true", help="Sort prompts by input length" + ) + parser.add_argument( + "--input-length-range", + type=str, + required=True, + help="Range of input lengths for sampling prompts," + 'specified as "min:max" (e.g., "128:256").', + ) + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Specifies the length of a common prefix to be " + "added to the input prompt. The input-length-range will " + "subtract this length when filtering prompts. Only used " + "when dataset-path is not provided.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/benchmark_prioritization.py b/vllm_v0.10.0/benchmarks/benchmark_prioritization.py new file mode 100644 index 0000000..bb45379 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_prioritization.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark offline prioritization.""" + +import argparse +import dataclasses +import json +import random +import time +from typing import Optional + +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +# Select a equi-probable random priority +def get_random_flag(): + return 0 if random.random() < 0.5 else 1 + + +def sample_requests( + dataset_path: str, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int], +) -> list[tuple[str, int, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short + filtered_dataset: list[tuple[str, int, int]] = [] + for i in range(len(dataset)): + if len(filtered_dataset) == num_requests: + break + + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt_token_ids = tokenizer(prompt).input_ids + completion = dataset[i][1] + completion_token_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_token_ids) + output_len = ( + len(completion_token_ids) if fixed_output_len is None else fixed_output_len + ) + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. + continue + + priority = get_random_flag() + + filtered_dataset.append((prompt, prompt_len, output_len, priority)) + + return filtered_dataset + + +def run_vllm( + requests: list[tuple[str, int, int]], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> float: + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " input_len and output_len for all requests." + ) + + # Add the requests to the engine. 
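+    # Each request gets its own SamplingParams (using its per-request output
+    # length), and its 0/1 priority is collected so it can be forwarded to
+    # llm.generate() below.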
+ prompts = [] + sampling_params = [] + priority = [] + for prompt, _, output_len, _priority in requests: + prompts.append(prompt) + priority.append(_priority) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=output_len, + detokenize=not disable_detokenize, + ) + ) + + start = time.perf_counter() + llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True) + end = time.perf_counter() + return end - start + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + if args.dataset is None: + # Synthesize a prompt with the given input length. + prompt = "hi" * (args.input_len - 1) + requests = [ + (prompt, args.input_len, args.output_len, get_random_flag()) + for _ in range(args.num_prompts) + ] + else: + requests = sample_requests( + args.dataset, args.num_prompts, tokenizer, args.output_len + ) + + if args.backend == "vllm": + elapsed_time = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum( + prompt_len + output_len for _, prompt_len, output_len, priority in requests + ) + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} tokens/s" + ) + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +def create_argument_parser(): + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument( + "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" + ) + parser.add_argument( + "--dataset", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=200, help="Number of prompts to process." + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + + main(args) diff --git a/vllm_v0.10.0/benchmarks/benchmark_serving.py b/vllm_v0.10.0/benchmarks/benchmark_serving.py new file mode 100644 index 0000000..c597fb1 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_serving.py @@ -0,0 +1,1290 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark online serving throughput. + +On the server side, run one of the following commands: + vLLM OpenAI API server + vllm serve \ + --swap-space 16 \ + --disable-log-requests + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset-name sharegpt \ + --dataset-path \ + --request-rate \ # By default is inf + --num-prompts # By default is 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" + +import argparse +import asyncio +import gc +import json +import os +import random +import time +import warnings +from collections.abc import Iterable +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Literal, Optional + +import numpy as np +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +from benchmark_dataset import ( + AIMODataset, + ASRDataset, + BurstGPTDataset, + ConversationDataset, + CustomDataset, + HuggingFaceDataset, + InstructCoderDataset, + MTBenchDataset, + NextEditPredictionDataset, + RandomDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + VisionArenaDataset, +) +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.benchmarks.serve import get_request + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. 
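+    # Like the latency metrics above, all E2EL statistics are reported in
+    # milliseconds.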
+ mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + + +def calculate_metrics( + input_requests: list[SampleRequest], + outputs: list[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = outputs[i].output_tokens + + if not output_len: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer( + outputs[i].generated_text, add_special_tokens=False + ).input_ids + ) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2, + ) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + model_name: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: list[SampleRequest], + logprobs: Optional[int], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + ignore_eos: bool, + goodput_config_dict: dict[str, float], + max_concurrency: Optional[int], + lora_modules: Optional[Iterable[str]], + extra_body: Optional[dict], + ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, + ramp_up_start_rps: Optional[int] = None, + ramp_up_end_rps: Optional[int] = None, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) + + assert test_mm_content is None or isinstance(test_mm_content, dict) + test_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}" + ) + else: + print("Initial test run completed. Starting main benchmark run...") + + if lora_modules: + # For each input request, choose a LoRA module at random. 
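+        # The random choices are drawn up front and consumed one by one with
+        # next() as requests are dispatched in the send loop below.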
+ lora_modules = iter( + [random.choice(lora_modules) for _ in range(len(input_requests))] + ) + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" + + if ramp_up_strategy is not None: + print( + f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase " + f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over " + "the duration of the benchmark." + ) + else: + print(f"Traffic request rate: {request_rate} RPS.") + + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: list[asyncio.Task] = [] + + rps_change_events = [] + last_int_rps = -1 + if ramp_up_strategy is not None and ramp_up_start_rps is not None: + last_int_rps = ramp_up_start_rps + rps_change_events.append( + { + "rps": last_int_rps, + "timestamp": datetime.now().isoformat(), + } + ) + + async for request, current_request_rate in get_request( + input_requests, + request_rate, + burstiness, + ramp_up_strategy, + ramp_up_start_rps, + ramp_up_end_rps, + ): + if ramp_up_strategy is not None: + current_int_rps = int(current_request_rate) + if current_int_rps > last_int_rps: + timestamp = datetime.now().isoformat() + for rps_val in range(last_int_rps + 1, current_int_rps + 1): + rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) + last_int_rps = current_int_rps + + prompt, prompt_len, output_len, mm_content = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + ) + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + task = limited_request_func(request_func_input=request_func_input, pbar=pbar) + tasks.append(asyncio.create_task(task)) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + 
prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) + if goodput_config_dict: + print( + "{:<40} {:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput": metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + + if rps_change_events: + result["rps_change_events"] = rps_change_events + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. 
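+        # Metrics not selected via --percentile-metrics are skipped entirely
+        # (neither printed nor added to the result dict).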
+ if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms" + ) + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms" + ) + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result + + +def check_goodput_args(args): + # Check and parse goodput arguments + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. " + ) + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative." + ) + return goodput_config_dict + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + 'Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds." + ) from err + return goodput_config_dict + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any], file_name: str +) -> None: + metrics = [ + "median_ttft_ms", + "mean_ttft_ms", + "std_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "median_tpot_ms", + "std_tpot_ms", + "p99_tpot_ms", + "median_itl_ms", + "mean_itl_ms", + "std_itl_ms", + "p99_itl_ms", + ] + # These raw data might be useful, but they are rather big. 
They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: [results[k]] for k in metrics}, + extra_info={ + k: results[k] + for k in results + if k not in metrics and k not in ignored_metrics + }, + ) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + model_name = args.served_model_name + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode + + # Validate ramp-up arguments + if args.ramp_up_strategy is not None: + if args.request_rate != float("inf"): + raise ValueError( + "When using ramp-up, do not specify --request-rate. " + "The request rate will be controlled by ramp-up parameters. " + "Please remove the --request-rate argument." + ) + if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: + raise ValueError( + "When using --ramp-up-strategy, both --ramp-up-start-rps and " + "--ramp-up-end-rps must be specified" + ) + if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: + raise ValueError("Ramp-up start and end RPS must be non-negative") + if args.ramp_up_start_rps > args.ramp_up_end_rps: + raise ValueError("Ramp-up start RPS must be less than end RPS") + if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0: + raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) + + if args.dataset_name is None: + raise ValueError( + "Please specify '--dataset-name' and the corresponding " + "'--dataset-path' if required." + ) + + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. + if args.backend == "openai-chat": + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + ) + else: + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset." 
+ ) + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + ) + + elif args.dataset_name == "hf": + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + args.hf_split = "train" + elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 + dataset_class = NextEditPredictionDataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set( + [ + dataset_name + for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ] + ) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats." + ) + + if dataset_class.IS_MULTIMODAL and backend not in [ + "openai-chat", + "openai-audio", + ]: + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backend." + ) + input_requests = dataset_class( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + random_seed=args.seed, + no_stream=args.no_stream, + ).sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.hf_output_len, + ) + + else: + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": lambda: ShareGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": lambda: BurstGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ), + } + + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err + goodput_config_dict = check_goodput_args(args) + + # Collect the sampling parameters. 
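+    # Only values the user explicitly set are kept; None entries are dropped.
+    # Temperature is defaulted to 0.0 (greedy decoding) further below if it
+    # was not provided.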
+ sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature, + }.items() + if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError( + "Sampling parameters are only supported by openai-compatible backends." + ) + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + + if args.backend == "llama.cpp": + # Disable prompt caching in llama.cpp backend + sampling_params["cache_prompt"] = False + + # Avoid GC processing "static" data - reduce pause times. + gc.collect() + gc.freeze() + + benchmark_result = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + model_name=model_name, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, + extra_body=sampling_params, + ramp_up_strategy=args.ramp_up_strategy, + ramp_up_start_rps=args.ramp_up_start_rps, + ramp_up_end_rps=args.ramp_up_end_rps, + ) + ) + + # Save config and results to json + if args.save_result or args.append_result: + result_json: dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["backend"] = backend + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["num_prompts"] = args.num_prompts + + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=") + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." 
+ ) + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf" + ) + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + if args.ramp_up_strategy is not None: + result_json["ramp_up_strategy"] = args.ramp_up_strategy + result_json["ramp_up_start_rps"] = args.ramp_up_start_rps + result_json["ramp_up_end_rps"] = args.ramp_up_end_rps + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] + + # Save to file + base_model_id = model_id.split("/")[-1] + max_concurrency_str = ( + f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None + else "" + ) + if args.ramp_up_strategy is not None: + file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + else: + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + if args.result_filename: + file_name = args.result_filename + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + with open( + file_name, mode="a+" if args.append_result else "w", encoding="utf-8" + ) as outfile: + # Append a newline. + if args.append_result and outfile.tell() != 0: + outfile.write("\n") + json.dump(result_json, outfile) + save_to_pytorch_benchmark_format(args, result_json, file_name) + + +def create_argument_parser(): + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput." + ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset-name", + type=str, + default="sharegpt", + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) + parser.add_argument( + "--no-stream", + action="store_true", + help="Do not load the dataset in streaming mode.", + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. 
This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." 
+ "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-separated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". ' + 'Use "--percentile-metrics" to select metrics.', + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help='Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is in " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' + "separated by spaces. Allowed request level metric names are " + '"ttft", "tpot", "e2el". For more context on the definition of ' + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt, used only for custom dataset.", + ) + + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help="Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help="Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help="Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.", + ) + + random_group = parser.add_argument_group("random dataset options") + random_group.add_argument( + "--random-input-len", + type=int, + default=1024, + help="Number of input tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. 
Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + random_group.add_argument( + "--random-prefix-len", + type=int, + default=0, + help=( + "Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]." + ), + ) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + hf_group.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + choices=["auto", "slow", "mistral", "custom"], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + "always use the slow tokenizer. \n* " + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) + + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) + + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) + + parser.add_argument( + "--ramp-up-strategy", + type=str, + default=None, + choices=["linear", "exponential"], + help="The ramp-up strategy. This would be used to " + "ramp up the request rate from initial RPS to final " + "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). " + "over the duration of the benchmark.", + ) + parser.add_argument( + "--ramp-up-start-rps", + type=int, + default=None, + help="The starting request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ramp-up-end-rps", + type=int, + default=None, + help="The ending request rate for ramp-up (RPS). 
" + "Needs to be specified when --ramp-up-strategy is used.", + ) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/benchmark_serving_structured_output.py b/vllm_v0.10.0/benchmarks/benchmark_serving_structured_output.py new file mode 100644 index 0000000..e23a5a9 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_serving_structured_output.py @@ -0,0 +1,1043 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark online serving throughput with structured outputs. + +On the server side, run one of the following commands: + (vLLM OpenAI API server) + vllm serve --disable-log-requests + +On the client side, run: + python benchmarks/benchmark_serving_structured_output.py \ + --backend \ + --model \ + --dataset json \ + --structured-output-ratio 1.0 \ + --request-rate 10 \ + --num-prompts 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" + +import argparse +import asyncio +import copy +import dataclasses +import json +import os +import random +import time +import uuid +import warnings +from collections.abc import AsyncGenerator +from dataclasses import dataclass +from typing import Optional + +import datasets +import numpy as np +import pandas as pd +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + RequestFuncInput, + RequestFuncOutput, +) + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +from vllm.v1.structured_output.backend_xgrammar import ( + has_xgrammar_unsupported_json_features, +) + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. 
+ """ + + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str + completion: str = None + + +def sample_requests( + tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace +) -> list[SampleRequest]: + if args.dataset == "json" or args.dataset == "json-unique": + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join( + dir_path, "structured_schemas", "structured_schema_1.json" + ) + json_schemas = [] + with open(args.json_schema_path) as f: + schema = json.load(f) + + if args.dataset == "json-unique": + json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)] + for i in range(len(json_schemas)): + if "properties" not in json_schemas[i]: + json_schemas[i]["properties"] = {} + json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = { + "type": "string", + "description": "An unique optional field to avoid cached schemas", + } + else: + json_schemas = [schema] * args.num_prompts + + def gen_prompt(index: int): + return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 + + def get_schema(index: int): + return json_schemas[index % len(json_schemas)] + + requests = [ + SampleRequest( + prompt=gen_prompt(i), + prompt_len=len(tokenizer(gen_prompt(i)).input_ids), + expected_output_len=args.output_len, + schema=get_schema(i), + structure_type=args.structure_type, + ) + for i in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + root ::= select_statement + + select_statement ::= "SELECT " column " from " table " where " condition + + column ::= "col_1 " | "col_2 " + + table ::= "table_1 " | "table_2 " + + condition ::= column "= " number + + number ::= "1 " | "2 " + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + ) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type, + ) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" 
+ input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type, + ) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + requests: list[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") + full_dataset_len = len(dataset) + + def _filter_func(item): + import json + + schema = json.loads(item["schema"]) + return not has_xgrammar_unsupported_json_features(schema) + + dataset = dataset.filter(_filter_func) + num_filtered_out = full_dataset_len - len(dataset) + print( + f"dataset has {len(dataset)} entries after filtering " + f"out {num_filtered_out} entries with unsupported features" + ) + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template( + dataset["prompt"][idx], tokenize=False, add_generation_prompt=True + ) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + completion=completion, + ) + ) + + return requests + + +async def get_request( + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[tuple[int, SampleRequest], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ + input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}." + ) + theta = 1.0 / (request_rate * burstiness) + + for i, request in enumerate(input_requests): + yield i, request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. 
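+ # For example, request_rate=10 and burstiness=1.0 give theta = 0.1 s and + # exponentially distributed intervals with mean 0.1 s; with burstiness=0.5 the + # mean interval is still 1 / request_rate, but the variance doubles + # (squared coefficient of variation = 1 / burstiness), i.e. burstier arrivals.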
+ await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: Optional[dict[str, float]] = None, +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids + ) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + outputs[i].tpot = tpot + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2, + ) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: list[str], + selected_percentiles: list[str], + ignore_eos: bool, + max_concurrency: Optional[int], + structured_output_ratio: float, + goodput_config_dict: Optional[dict[str, float]] = None, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + def prepare_extra_body(request) -> dict: + extra_body = {} + # Add the schema to the extra_body + extra_body[request.structure_type] = request.schema + return extra_body + + print("Starting initial single prompt test run...") + structured_output_req_idx = random.sample( + range(len(input_requests)), int(len(input_requests) * structured_output_ratio) + ) + + test_request = input_requests[0] + test_req_extra_body = ( + prepare_extra_body(test_request) if 0 in structured_output_req_idx else None + ) + test_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=api_url, + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=test_req_extra_body, + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}" + ) + else: + print("Initial test run completed. 
Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/start_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=test_req_extra_body, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: list[asyncio.Task] = [] + expected: list[str] = [] + async for i, request in get_request(input_requests, request_rate, burstiness): + extra_body = ( + prepare_extra_body(request) if i in structured_output_req_idx else None + ) + request_func_input = RequestFuncInput( + model=model_id, + prompt=request.prompt, + api_url=api_url, + prompt_len=request.prompt_len, + output_len=request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + expected.append(request.completion) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, pbar=pbar) + ) + ) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) + if goodput_config_dict: + print( + "{:<40} 
{:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "ttft_description": pd.Series([output.ttft for output in outputs]) + .describe() + .to_dict(), + "tpot_description": pd.Series([output.tpot for output in outputs]) + .describe() + .to_dict(), + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "errors": [output.error for output in outputs], + } + + ret = [ + {"generated": output.generated_text, "expected": gt} + for output, gt in zip(outputs, expected) + ] + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms" + ) + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms" + ) + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result, ret + + +def evaluate(ret, args): + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import regex as re + + actual = actual.replace("\n", "").replace(" ", "").strip() + try: + actual = re.search(r"\{.*\}", actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import regex as re + + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == "guided_json": + return _eval_correctness_json(expected, actual) + elif args.structure_type == "guided_regex": + return _eval_correctness_regex(expected, actual) + elif args.structure_type == "guided_choice": + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res["expected"], res["generated"]) + res["correctness"] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return ( + (sum(not_none_scores) / len(not_none_scores) * 100) + if len(not_none_scores) > 0 + else None + ) + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + 'Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds." + ) from err + return goodput_config_dict + + +def check_goodput_args(args): + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. " + ) + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative." 
+ ) + return goodput_config_dict + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer( + tokenizer_id, + trust_remote_code=args.trust_remote_code, + tokenizer_mode=args.tokenizer_mode, + ) + + if args.dataset == "grammar": + args.structure_type = "guided_grammar" + elif args.dataset == "regex": + args.structure_type = "guided_regex" + elif args.dataset == "choice": + args.structure_type = "guided_choice" + else: + args.structure_type = "guided_json" + + if args.no_structured_output: + args.structured_output_ratio = 0 + if args.save_results: + result_file_name = f"{args.structured_output_ratio}guided" + result_file_name += f"_{backend}" + result_file_name += f"_{args.request_rate}qps" + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += ".txt" + else: + result_file_name = None + + input_requests = sample_requests(tokenizer, args) + + goodput_config_dict = check_goodput_args(args) + + benchmark_result, ret = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + ignore_eos=args.ignore_eos, + max_concurrency=args.max_concurrency, + structured_output_ratio=args.structured_output_ratio, + goodput_config_dict=goodput_config_dict, + ) + ) + + # Save config and results to json + score = evaluate(ret, args) + print("correct_rate(%)", score, "\n") + if args.save_results: + results = { + "backend": backend, + "model_id": model_id, + "tokenizer_id": tokenizer_id, + "num_prompts": args.num_prompts, + "request_rate": args.request_rate + if args.request_rate < float("inf") + else "inf", + "burstiness": args.burstiness, + "max_concurrency": args.max_concurrency, + "correct_rate(%)": score, + } + results = {"outputs": ret, **results, **benchmark_result} + + # Save to file + if args.result_filename: + result_file_name = args.result_filename + if args.result_dir: + result_file_name = os.path.join(args.result_dir, result_file_name) + with open(result_file_name, "w", encoding="utf-8") as outfile: + json.dump(results, outfile, indent=4) + + +def create_argument_parser(): + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput." 
+ ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + default="json", + choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"], + ) + parser.add_argument( + "--json-schema-path", type=str, default=None, help="Path to json schema." + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + help="The tokenizer mode.", + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--output-len", + type=int, + default=128, + help="Number of output tokens.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only takes effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from Hugging Face.", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--save-results", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. 
The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results. " + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results. " + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request. " + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-separated list of selected metrics to report percentiles. " + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + 'To report 25th, 50th, and 75th percentiles, use "25,50,75". ' + 'Default value is "99". ' + 'Use "--percentile-metrics" to select metrics.', + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help='Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is in " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' + "separated by spaces. Allowed request level metric names are " + '"ttft", "tpot", "e2el". For more context on the definition of ' + "goodput, refer to the DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + parser.add_argument( + "--no-structured-output", + action="store_true", + default=False, + help="Whether to disable structured output (guided decoding) or not.", + ) + parser.add_argument( + "--structured-output-ratio", + type=float, + default=1.0, + help="Ratio of Structured Outputs requests", + ) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/benchmark_throughput.py b/vllm_v0.10.0/benchmarks/benchmark_throughput.py new file mode 100644 index 0000000..1446112 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_throughput.py @@ -0,0 +1,736 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark offline inference throughput.""" + +import argparse +import dataclasses +import json +import os +import random +import time +import warnings +from typing import Any, Optional, Union + +import torch +import uvloop +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase + +from benchmark_dataset import ( + AIMODataset, + BurstGPTDataset, + ConversationDataset, + InstructCoderDataset, + RandomDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + VisionArenaDataset, +) + from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args, +) +from vllm.inputs import TextPrompt, TokensPrompt +from vllm.lora.request
import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import BeamSearchParams +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +def run_vllm( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> tuple[float, Optional[list[RequestOutput]]]: + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) + # Add the requests to the engine. + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append( + TokensPrompt( + prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data, + ) + if "prompt_token_ids" in request.prompt + else TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + lora_requests: Optional[list[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + outputs = None + if not use_beam_search: + start = time.perf_counter() + outputs = llm.generate( + prompts, sampling_params, lora_request=lora_requests, use_tqdm=True + ) + end = time.perf_counter() + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. + output_len = requests[0].expected_output_len + for request in requests: + assert request.expected_output_len == output_len + start = time.perf_counter() + llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + ), + ) + end = time.perf_counter() + return end - start, outputs + + +def run_vllm_chat( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> tuple[float, list[RequestOutput]]: + """ + Run vLLM chat benchmark. This function is recommended ONLY for benchmarking + multimodal models as it properly handles multimodal inputs and chat + formatting. For non-multimodal models, use run_vllm() instead. + """ + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests." 
+ ) + + prompts = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + start = time.perf_counter() + outputs = llm.chat(prompts, sampling_params, use_tqdm=True) + end = time.perf_counter() + return end - start, outputs + + +async def run_vllm_async( + requests: list[SampleRequest], + n: int, + engine_args: AsyncEngineArgs, + disable_frontend_multiprocessing: bool = False, + disable_detokenize: bool = False, +) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing + ) as llm: + model_config = await llm.get_model_config() + assert all( + model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) + + # Add the requests to the engine. + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] + for request in requests: + prompts.append( + TokensPrompt( + prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data, + ) + if "prompt_token_ids" in request.prompt + else TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + lora_requests.append(request.lora_request) + + generators = [] + start = time.perf_counter() + for i, (prompt, sp, lr) in enumerate( + zip(prompts, sampling_params, lora_requests) + ): + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + end = time.perf_counter() + return end - start + + +def run_hf( + requests: list[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + max_batch_size: int, + trust_remote_code: bool, + disable_detokenize: bool = False, +) -> float: + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) + if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: list[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt = requests[i].prompt + prompt_len = requests[i].prompt_len + output_len = requests[i].expected_output_len + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + next_prompt_len = requests[i + 1].prompt_len + next_output_len = requests[i + 1].expected_output_len + if ( + max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len) + ) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. 
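+ # Reaching this point means the batch is flushed: it hit max_batch_size, + # this is the last request, or padding in the next request would exceed the + # 2048-token budget checked above (e.g. max_prompt_len=1500, max_output_len=300 + # and a next request with prompt_len=1700, output_len=500 gives + # 1700 + 500 = 2200 > 2048, so generation happens now).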
+ input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=True, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + if not disable_detokenize: + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. + batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def run_mii( + requests: list[SampleRequest], + model: str, + tensor_parallel_size: int, + output_len: int, +) -> float: + from mii import client, serve + + llm = serve(model, tensor_parallel=tensor_parallel_size) + prompts = [request.prompt for request in requests] + + start = time.perf_counter() + llm.generate(prompts, max_new_tokens=output_len) + end = time.perf_counter() + client = client(model) + client.terminate_server() + return end - start + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={ + "requests_per_second": [results["requests_per_second"]], + "tokens_per_second": [results["tokens_per_second"]], + }, + extra_info={ + k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }, + ) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def get_requests(args, tokenizer): + # Common parameters for all dataset types. + common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + if args.backend == "vllm-chat": + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset." 
+ ) + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + elif args.dataset_name == "hf": + common_kwargs["no_stream"] = args.no_stream + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = VisionArenaDataset + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = InstructCoderDataset + common_kwargs["dataset_split"] = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = ConversationDataset + common_kwargs["dataset_subset"] = args.hf_subset + common_kwargs["dataset_split"] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_cls = AIMODataset + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" + else: + raise ValueError(f"Unknown dataset name: {args.dataset_name}") + # Remove None values + sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} + return dataset_cls(**common_kwargs).sample(**sample_kwargs) + + +def main(args: argparse.Namespace): + if args.seed is None: + args.seed = 0 + print(args) + random.seed(args.seed) + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + requests = get_requests(args, tokenizer) + is_multi_modal = any(request.multi_modal_data is not None for request in requests) + request_outputs: Optional[list[RequestOutput]] = None + if args.backend == "vllm": + if args.async_engine: + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + args.disable_detokenize, + ) + ) + else: + elapsed_time, request_outputs = run_vllm( + requests, + args.n, + EngineArgs.from_cli_args(args), + args.disable_detokenize, + ) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf( + requests, + args.model, + tokenizer, + args.n, + args.hf_max_batch_size, + args.trust_remote_code, + args.disable_detokenize, + ) + elif args.backend == "mii": + elapsed_time = run_mii( + requests, args.model, args.tensor_parallel_size, args.output_len + ) + elif args.backend == "vllm-chat": + elapsed_time, request_outputs = run_vllm_chat( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + + if request_outputs: + # Note: with the vllm and vllm-chat backends, + # we have request_outputs, which we use to count tokens. 
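+ # Prompt tokens are taken from each RequestOutput.prompt_token_ids and output + # tokens are summed over its generated sequences; the else branch below instead + # estimates the totals from prompt_len and expected_output_len.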
+ total_prompt_tokens = 0 + total_output_tokens = 0 + for ro in request_outputs: + if not isinstance(ro, RequestOutput): + continue + total_prompt_tokens += ( + len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 + ) + total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) + total_num_tokens = total_prompt_tokens + total_output_tokens + else: + total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) + total_output_tokens = sum(r.expected_output_len for r in requests) + total_prompt_tokens = total_num_tokens - total_output_tokens + + if is_multi_modal and args.backend != "vllm-chat": + print( + "\033[91mWARNING\033[0m: Multi-modal request with " + f"{args.backend} backend detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details." + ) + # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. + # vllm-chat backend counts the image tokens now + + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s" + ) + print(f"Total num prompt tokens: {total_prompt_tokens}") + print(f"Total num output tokens: {total_output_tokens}") + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) + + +def validate_args(args): + """ + Validate command-line arguments. + """ + + # === Deprecation and Defaulting === + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next release. " + "Please use '--dataset-name' and '--dataset-path' instead.", + stacklevel=2, + ) + args.dataset_path = args.dataset + + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model + + # === Backend Validation === + valid_backends = {"vllm", "hf", "mii", "vllm-chat"} + if args.backend not in valid_backends: + raise ValueError(f"Unsupported backend: {args.backend}") + + # === Dataset Configuration === + if not args.dataset and not args.dataset_path: + print("When dataset path is not set, it will default to random dataset") + args.dataset_name = "random" + if args.input_len is None: + raise ValueError("input_len must be provided for a random dataset") + + # === Dataset Name Specific Checks === + # --hf-subset and --hf-split: only used + # when dataset_name is 'hf' + if args.dataset_name != "hf" and ( + getattr(args, "hf_subset", None) is not None + or getattr(args, "hf_split", None) is not None + ): + warnings.warn( + "--hf-subset and --hf-split will be ignored \ + since --dataset-name is not 'hf'.", + stacklevel=2, + ) + elif args.dataset_name == "hf": + if args.dataset_path in ( + VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() + | ConversationDataset.SUPPORTED_DATASET_PATHS + ): + assert args.backend == "vllm-chat", ( + f"{args.dataset_path} needs to use vllm-chat as the backend." + ) # noqa: E501 + elif args.dataset_path in ( + InstructCoderDataset.SUPPORTED_DATASET_PATHS + | AIMODataset.SUPPORTED_DATASET_PATHS + ): + assert args.backend == "vllm", ( + f"{args.dataset_path} needs to use vllm as the backend." 
+ ) # noqa: E501 + else: + raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") + + # --random-range-ratio: only used when dataset_name is 'random' + if args.dataset_name != "random" and args.random_range_ratio is not None: + warnings.warn( + "--random-range-ratio will be ignored since \ + --dataset-name is not 'random'.", + stacklevel=2, + ) + + # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not + # set. + if ( + args.dataset_name not in {"random", "sonnet", None} + and args.prefix_len is not None + ): + warnings.warn( + "--prefix-len will be ignored since --dataset-name\ + is not 'random', 'sonnet', or not set.", + stacklevel=2, + ) + + # === LoRA Settings === + if getattr(args, "enable_lora", False) and args.backend != "vllm": + raise ValueError("LoRA benchmarking is only supported for vLLM backend") + if getattr(args, "enable_lora", False) and args.lora_path is None: + raise ValueError("LoRA path must be provided when enable_lora is True") + + # === Backend-specific Validations === + if args.backend == "hf" and args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend") + if args.backend != "hf" and args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + + if ( + args.backend in {"hf", "mii"} + and getattr(args, "quantization", None) is not None + ): + raise ValueError("Quantization is only for vLLM backend.") + + if args.backend == "mii" and args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.backend == "mii" and args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.backend == "mii" and args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, \ + please use benchmark serving instead" + ) + + +def create_argument_parser(): + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument( + "--backend", + type=str, + choices=["vllm", "hf", "mii", "vllm-chat"], + default="vllm", + ) + parser.add_argument( + "--dataset-name", + type=str, + choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + help="Name of the dataset to benchmark on.", + default="sharegpt", + ) + parser.add_argument( + "--no-stream", + action="store_true", + help="Do not load the dataset in streaming mode.", + ) + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in\ + the next release. The dataset is expected to " + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]", + ) + parser.add_argument( + "--dataset-path", type=str, default=None, help="Path to the dataset" + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=1000, help="Number of prompts to process." 
+ ) + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--async-engine", + action="store_true", + default=False, + help="Use vLLM async engine rather than LLM class.", + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)" + ), + ) + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the LoRA adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.", + ) + parser.add_argument( + "--prefix-len", + type=int, + default=None, + help=f"Number of prefix tokens to be used in RandomDataset " + "and SonnetDataset. For RandomDataset, the total input " + "length is the sum of prefix-len (default: " + f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " + "sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]. For SonnetDataset, " + f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " + "controls how much of the input is fixed lines versus " + "random lines, but the total input length remains approximately " + "input_len tokens.", + ) + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=None, + help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " + "for sampling input/output length, " + "used only for RandomDataset. Must be in the range [0, 1) to " + "define a symmetric sampling range " + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + + # hf dtaset + parser.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + parser.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." 
+ ) + + parser = AsyncEngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + validate_args(args) + main(args) diff --git a/vllm_v0.10.0/benchmarks/benchmark_utils.py b/vllm_v0.10.0/benchmarks/benchmark_utils.py new file mode 100644 index 0000000..283f938 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/benchmark_utils.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import math +import os +from typing import Any + + +def convert_to_pytorch_benchmark_format( + args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] +) -> list: + """ + Save the benchmark results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + + tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") + # Save tensor_parallel_size parameter if it's part of the metadata + if not tp and "tensor_parallel_size" in extra_info: + record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( + extra_info["tensor_parallel_size"] + ) + + records.append(record) + + return records + + +class InfEncoder(json.JSONEncoder): + def clear_inf(self, o: Any): + if isinstance(o, dict): + return {k: self.clear_inf(v) for k, v in o.items()} + elif isinstance(o, list): + return [self.clear_inf(v) for v in o] + elif isinstance(o, float) and math.isinf(o): + return "inf" + return o + + def iterencode(self, o: Any, *args, **kwargs) -> Any: + return super().iterencode(self.clear_inf(o), *args, **kwargs) + + +def write_to_json(filename: str, records: list) -> None: + with open(filename, "w") as f: + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", + ) diff --git a/vllm_v0.10.0/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/sparse_benchmarks.py new file mode 100644 index 0000000..9ec270b --- /dev/null +++ b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -0,0 +1,516 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import itertools +import pickle as pkl +import time +from collections.abc import Iterable +from typing import Callable + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_sparse_tensors +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + + +# bench +def bench_fn( + label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs +) -> TMeasurement: + min_run_time = 1 + + globals 
= { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + + out = ops.cutlass_scaled_sparse_mm( + a, b_compressed, e, scale_a, scale_b, torch.bfloat16 + ) + out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) + + if not torch.allclose(out, out_ref): + print("Incorrect results") + print(out) + print(out_ref) + else: + print("Correct results") + + timers = [] + # pytorch impl - bfloat16 + timers.append( + bench_fn( + label, + sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16), + ) + ) + + # pytorch impl - float16 + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.float16), + b.to(dtype=torch.float16), + ) + ) + + # cutlass impl + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_mm", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_mm_bias", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) + + # cutlass sparse impl + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass sparse with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) + + return timers + + +def bench_fp8( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + + out = ops.cutlass_scaled_sparse_mm( + a, b_compressed, e, scale_a, scale_b, torch.bfloat16 + ) + out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) + + if not torch.allclose(out, out_ref): + print("Incorrect results") + print(out) + print(out_ref) + else: + print("Correct results") + + timers = [] + + # pytorch impl w. 
bf16 + timers.append( + bench_fn( + label, + sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), + ) + ) + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + ) + ) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True, + ) + ) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + ) + ) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True, + ) + ) + + # cutlass impl: bf16 output + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_mm", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass impl: bf16 output + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass impl: fp16 output + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.float16, + ) + ) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.float16, + bias.to(dtype=torch.float16), + ) + ) + + return timers + + +def bench( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run( + dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]] +) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})") + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with 
open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']", + ) + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/vllm_v0.10.0/benchmarks/cutlass_benchmarks/utils.py b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/utils.py new file mode 100644 index 0000000..b4f3c6b --- /dev/null +++ b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/utils.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Cutlass bench utils +from collections.abc import Iterable + +import torch + +import vllm._custom_ops as ops + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def make_rand_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device="cuda") * 5 + b = torch.randn((n, k), device="cuda").t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def 
make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device="cuda") * 5 + b = torch.randn((n, k), device="cuda").t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, a, b + + +def make_n_rand_sparse_tensors( + num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + ABs = [] + for _ in range(num_tensors): + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + if b_comp is not None: + ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) + BComps, Es, As, Bs = zip(*ABs) + return list(BComps), list(Es), list(As), list(Bs) diff --git a/vllm_v0.10.0/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py new file mode 100644 index 0000000..a5a5b52 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -0,0 +1,372 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import itertools +import pickle as pkl +import time +from collections.abc import Iterable +from typing import Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_tensors +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul, +) +from vllm.utils import FlexibleArgumentParser, cdiv + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + + +# bench +def bench_fn( + label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs +) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + """Benchmark INT8-based kernels.""" + assert dtype == torch.int8 + a, b = make_rand_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + azp = torch.zeros((m,), device="cuda", dtype=torch.int32) + azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.float16), b.to(dtype=torch.float16) + ), + "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, 
torch.bfloat16 + ), + "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16, bias + ), + "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias + ), + } + + timers = [] + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. + if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) + + return timers + + +def bench_fp8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + """Benchmark FP8-based kernels.""" + assert dtype == torch.float8_e4m3fn + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + a_cont = a.contiguous() + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + + block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) + block_scale_b = torch.rand( + cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 + ) + block_scale_a_M_major = block_scale_a.t().contiguous().t() + block_scale_b_K_major = block_scale_b.t().contiguous().t() + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + + print(m, k, n) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.float16), b.to(dtype=torch.float16) + ), + "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16 + ), + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True + ), + "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16 + ), + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True + ), + "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16 + ), + "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.float16 + ), + "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16, bias + ), + "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) + ), + "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( + a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) + ), + "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( + a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16 + ), + } + + timers = [] + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. 
Otherwise, run only exact matches. + if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) + + return timers + + +def bench( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run( + dtype: torch.dtype, + MKNs: Iterable[tuple[int, int, int]], + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench( + dtype, + m, + k, + n, + f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + bench_kernels=bench_kernels, + ) + print_timers(timers) + results.extend(timers) + return results + + +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +def run_square_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs, bench_kernels=args.kernels) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + 
raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']", + ) + parser.add_argument( + "--kernels", + nargs="+", + type=str, + default=None, + help="Exact names of the kernels to benchmark. If not set, runs all kernels.", + ) + + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/vllm_v0.10.0/benchmarks/cutlass_benchmarks/weight_shapes.py b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/weight_shapes.py new file mode 100644 index 0000000..25b96ef --- /dev/null +++ b/vllm_v0.10.0/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 
0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], +} diff --git a/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh new file mode 100644 index 0000000..9499963 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# benchmark the overhead of disaggregated prefill. +# methodology: +# - send all request to prefill vLLM instance. It will buffer KV cache. +# - then send all request to decode instance. +# - The TTFT of decode instance is the overhead. + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=10 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + + # let the prefill instance finish prefill + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1.json \ + --request-rate "inf" + + + # send the request to decode. + # The TTFT of this command will be the overhead of disagg prefill impl. 
+ python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1_overhead.json \ + --request-rate "$qps" + kill_gpu_processes + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx datasets + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=1 + default_output_len=1 + benchmark $default_qps $default_output_len + +} + + +main "$@" diff --git a/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh new file mode 100644 index 0000000..eb5d891 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# Requirement: 2x GPUs. + + +# Model: meta-llama/Meta-Llama-3.1-8B-Instruct +# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests +# Resource: 2x GPU +# Approaches: +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. 
+ pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done + sleep 1 +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +launch_chunked_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + wait_for_server 8100 + wait_for_server 8200 + python3 round_robin_proxy.py & + sleep 1 +} + + +launch_disagg_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + python3 disagg_prefill_proxy_server.py & + sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=100 + qps=$1 + prefix_len=50 + input_len=1024 + output_len=$2 + tag=$3 + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" + + sleep 2 +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + (which lsof) || (apt-get -y install lsof) + + pip install quart httpx matplotlib aiohttp datasets + + cd "$(dirname "$0")" + + cd .. 
+ # create sonnet-4x.txt so that we can sample 2048 tokens for input + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_output_len=6 + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + launch_chunked_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len chunked_prefill + done + kill_gpu_processes + + launch_disagg_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len disagg_prefill + done + kill_gpu_processes + + python3 visualize_benchmark_results.py + +} + + +main "$@" diff --git a/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py new file mode 100644 index 0000000..f62d810 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os + +import aiohttp +from quart import Quart, make_response, request + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +app = Quart(__name__) + + +async def forward_request(url, data): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + async with session.post(url=url, json=data, headers=headers) as response: + if response.status == 200: + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + content = await response.read() + yield content + + +@app.route("/v1/completions", methods=["POST"]) +async def handle_request(): + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill + prefill_request["max_tokens"] = 1 + + # finish prefill + async for _ in forward_request( + "http://localhost:8100/v1/completions", prefill_request + ): + continue + + # return decode + generator = forward_request( + "http://localhost:8200/v1/completions", original_request_data + ) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + import sys + import traceback + + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + + +if __name__ == "__main__": + app.run(port=8000) diff --git a/vllm_v0.10.0/benchmarks/disagg_benchmarks/round_robin_proxy.py b/vllm_v0.10.0/benchmarks/disagg_benchmarks/round_robin_proxy.py new file mode 100644 index 0000000..b1df2f2 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import itertools + +import aiohttp +from aiohttp import web + + +class RoundRobinProxy: + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) + + async def handle_request(self, request): + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + 
headers=request.headers, + data=request.content, + ) as response: + # Start sending the response + resp = web.StreamResponse( + status=response.status, headers=response.headers + ) + await resp.prepare(request) + + # Stream the response content + async for chunk in response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) + + +async def main(): + proxy = RoundRobinProxy([8100, 8200]) + app = web.Application() + app.router.add_route("*", "/{path:.*}", proxy.handle_request) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, "localhost", 8000) + await site.start() + + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/vllm_v0.10.0/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/vllm_v0.10.0/benchmarks/disagg_benchmarks/visualize_benchmark_results.py new file mode 100644 index 0000000..74fa56d --- /dev/null +++ b/vllm_v0.10.0/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import matplotlib.pyplot as plt +import pandas as pd + +if __name__ == "__main__": + data = [] + for name in ["disagg_prefill", "chunked_prefill"]: + for qps in [2, 4, 6, 8]: + with open(f"results/{name}-qps-{qps}.json") as f: + x = json.load(f) + x["name"] = name + x["qps"] = qps + data.append(x) + + df = pd.DataFrame.from_dict(data) + dis_df = df[df["name"] == "disagg_prefill"] + chu_df = df[df["name"] == "chunked_prefill"] + + plt.style.use("bmh") + plt.rcParams["font.size"] = 20 + + for key in [ + "mean_ttft_ms", + "median_ttft_ms", + "p99_ttft_ms", + "mean_itl_ms", + "median_itl_ms", + "p99_itl_ms", + ]: + fig, ax = plt.subplots(figsize=(11, 7)) + plt.plot( + dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4 + ) + plt.plot( + chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4 + ) + ax.legend() + + ax.set_xlabel("QPS") + ax.set_ylabel(key) + ax.set_ylim(bottom=0) + fig.savefig(f"results/{key}.png") + plt.close(fig) diff --git a/vllm_v0.10.0/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/vllm_v0.10.0/benchmarks/fused_kernels/layernorm_rms_benchmarks.py new file mode 100644 index 0000000..9015242 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pickle as pkl +import time +from collections.abc import Iterable +from dataclasses import dataclass +from itertools import product +from typing import Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from tqdm import tqdm + +import vllm._custom_ops as ops +from vllm.model_executor.layers.layernorm import RMSNorm + + +@dataclass +class bench_params_t: + num_tokens: int + hidden_size: int + add_residual: bool + dtype: torch.dtype + + def description(self): + return ( + f"N {self.num_tokens} " + f"x D {self.hidden_size} " + f"x R {self.add_residual} " + f"x DT {self.dtype}" + ) + + +def get_bench_params() -> list[bench_params_t]: + ## Test Fixtures + NUM_TOKENS = [2**x for x in range(11)] + HIDDEN_SIZES = 
list(range(1024, 8129, 1024)) + ADD_RESIDUAL = [True, False] + DTYPES = [torch.bfloat16, torch.float] + + combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) + bench_params = list( + map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations) + ) + return bench_params + + +# Reference impls +def unfused_int8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _, _ = ops.scaled_int8_quant(torch_out) + + +def unfused_fp8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _ = ops.scaled_fp8_quant(torch_out) + + +def fused_impl( + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + out, _ = ops.rms_norm_dynamic_per_token_quant( + x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual + ) + + +# Bench functions +def bench_fn( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: torch.Tensor, + quant_dtype: torch.dtype, + label: str, + sub_label: str, + fn: Callable, + description: str, +) -> TMeasurement: + min_run_time = 1 + + globals = { + "rms_norm_layer": rms_norm_layer, + "x": x, + "residual": residual, + "quant_dtype": quant_dtype, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(rms_norm_layer, x, residual, quant_dtype)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]: + # Make inputs + layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) + # Make weights + layer.weight.data.normal_(mean=1.0, std=0.1) + # Make inputs + scale = 1 / params.hidden_size + x = ( + torch.randn( + params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda" + ) + * scale + ) + residual = ( + (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None + ) + + timers = [] + + # unfused int8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.int8, + label, + sub_label, + unfused_int8_impl, + "unfused_int8_impl", + ) + ) + + # unfused fp8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + label, + sub_label, + unfused_fp8_impl, + "unfused_fp8_impl", + ) + ) + + # fused int8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.int8, + label, + sub_label, + fused_impl, + "fused_int8_impl", + ) + ) + + # fused fp8 impl. 
+ timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + label, + sub_label, + fused_impl, + "fused_fp8_impl", + ) + ) + + print_timers(timers) + + return timers + + +# launch bench +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def main(): + torch.set_default_device("cuda") + bench_params = get_bench_params() + + timers = [] + for bp in tqdm(bench_params): + timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) + print_timers(timers) + + # pickle all the results + timestamp = int(time.time()) + with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f: + pkl.dump(timers, f) + + +if __name__ == "__main__": + main() diff --git a/vllm_v0.10.0/benchmarks/kernels/bench_fp8_gemm.py b/vllm_v0.10.0/benchmarks/kernels/bench_fp8_gemm.py new file mode 100644 index 0000000..9209618 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/bench_fp8_gemm.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "fp8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "fp8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "fp8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "fp8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "fp8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "fp8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + else: + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True) + return b_fp8.t(), scale_b_fp8 + + +def build_fp8_runner(cfg, a, b, dtype, device): + b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device) + + scale_a_const = ( + torch.ones(1, device=device, dtype=torch.float32) + if cfg["a"] == "tensor" + else None + ) + + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + else: + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + + def run(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + + if cfg["a"] == "tensor": + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + + 
+@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs FP8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_fp8_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_fp8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/vllm_v0.10.0/benchmarks/kernels/bench_int8_gemm.py b/vllm_v0.10.0/benchmarks/kernels/bench_int8_gemm.py new file mode 100644 index 0000000..e9c6d64 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/bench_int8_gemm.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "int8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "int8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "int8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "int8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "int8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "int8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "int8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "int8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + + +def _quant_weight(b, w_type, device): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_int8, scale_b_int8, _ = 
vllm_scaled_int8_quant(b, scale_b) + assert scale_b_int8.numel() == 1 + else: # channel + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b) + assert scale_b_int8.numel() == b.shape[0] + return b_int8.t(), scale_b_int8 + + +def build_int8_runner(cfg, a, b, dtype, device): + # quant before running the kernel + b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device) + + scale_a_const = None + if cfg["a"] == "tensor": + scale_a_const = torch.ones(1, device=device, dtype=torch.float32) + + # no quant, create activation ahead + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + else: # token + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + + def run_quant(): + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + # dynamic quant, create activation inside + if cfg["a"] == "tensor": + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + else: # token + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")] + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=[k for k in _enabled], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs INT8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_int8_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_int8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/vllm_v0.10.0/benchmarks/kernels/bench_nvfp4_gemm.py b/vllm_v0.10.0/benchmarks/kernels/bench_nvfp4_gemm.py new file mode 100644 index 0000000..9e832c9 --- /dev/null +++ 
b/vllm_v0.10.0/benchmarks/kernels/bench_nvfp4_gemm.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "nvfp4": dict(no_a_quant=False, enabled=True), + "nvfp4-noquant": dict(no_a_quant=True, enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def _quant_weight_nvfp4(b: torch.Tensor, device: str): + # Compute global scale for weight + b_amax = torch.abs(b).max().to(torch.float32) + b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale) + return b_fp4, scale_b_fp4, b_global_scale + + +def build_nvfp4_runner(cfg, a, b, dtype, device): + b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device) + + # Compute global scale for activation + # NOTE: This is generally provided ahead-of-time by the model checkpoint. + a_amax = torch.abs(a).max().to(torch.float32) + a_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + + # Alpha for the GEMM operation + alpha = 1.0 / (a_global_scale * b_global_scale) + + if cfg["no_a_quant"]: + # Pre-quantize activation + a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale) + + def run(): + return ops.cutlass_scaled_fp4_mm( + a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype + ) + + return run + + # Quantize activation on-the-fly + def run(): + a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale) + return ops.cutlass_scaled_fp4_mm( + a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype + ) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs NVFP4 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_nvfp4_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + 
"--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_nvfp4_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/vllm_v0.10.0/benchmarks/kernels/bench_per_token_quant_fp8.py b/vllm_v0.10.0/benchmarks/kernels/bench_per_token_quant_fp8.py new file mode 100644 index 0000000..923d678 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +from typing import Callable + +import torch + +from vllm import _custom_ops as ops +from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.triton_utils import triton + + +# TODO(luka): use standalone_compile utility +def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int): + def inner(*args): + torch._dynamo.mark_dynamic(args[arg_index], dim_index) + return fn(*args) + + return inner + + +torch._dynamo.config.recompile_limit = 8888 +compilation_config = CompilationConfig(custom_ops=["none"]) +with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)): + torch_per_token_quant_fp8 = torch.compile( + QuantFP8(False, GroupShape.PER_TOKEN), + fullgraph=True, + dynamic=False, # recompile for different shapes + ) + + # First dim is explicitly dynamic to simulate vLLM usage + torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0) + + +def cuda_per_token_quant_fp8( + input: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + return ops.scaled_fp8_quant(input) + + +def calculate_diff(batch_size: int, seq_len: int): + """Calculate difference between Triton and CUDA implementations.""" + device = torch.device("cuda") + x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device) + + torch_out, torch_scale = torch_per_token_quant_fp8(x) + cuda_out, cuda_scale = cuda_per_token_quant_fp8(x) + + if torch.allclose( + cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5 + ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [1, 16, 32, 64, 128] +seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] + +configs = list(itertools.product(batch_size_range, seq_len_range)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "seq_len"], + x_vals=configs, + line_arg="provider", + line_vals=["torch", "cuda"], + line_names=["Torch", "CUDA"], + styles=[("blue", "-"), ("green", "-")], + ylabel="us", + plot_name="per-token-dynamic-quant-fp8-performance", + args={}, + ) +) +def benchmark_quantization(batch_size, seq_len, provider): + dtype = torch.float16 + device = torch.device("cuda") + + x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch": + fn = lambda: torch_per_token_quant_fp8(x.clone()) + elif provider == "cuda": + 
fn = lambda: cuda_per_token_quant_fp8(x.clone()) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +if __name__ == "__main__": + calculate_diff(batch_size=4, seq_len=4096) + benchmark_quantization.run(print_data=True) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_aqlm.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_aqlm.py new file mode 100644 index 0000000..42de062 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_aqlm.py @@ -0,0 +1,345 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import sys +from typing import Optional + +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.aqlm import ( + dequantize_weight, + generic_dequantize_gemm, + get_int_dtype, + optimized_dequantize_gemm, +) +from vllm.utils import FlexibleArgumentParser + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + + +def torch_mult( + # [..., in_features] + input: torch.Tensor, + weights: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, +) -> torch.Tensor: + output = F.linear(input, weights) + return output + + +def dequant_out_scale( + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + if bias is None: + output = F.linear(input, weights, bias) + orig_shape = output.shape + flattened_output = output.view(-1, output.size(-1)) + f_scales = scales.view(-1, scales.shape[0]) + b_scales = f_scales.expand(flattened_output.shape[0], -1) + flattened_output *= b_scales + return flattened_output.view(orig_shape) + else: + b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_weight_scale( + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_no_scale( + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + return F.linear(input, weights, bias) + + +# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against +# the generic pytorch version. +# Just visual comparison. 
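+# dequant_test builds synthetic codes/codebooks with recognizable values,
+# dequantizes them through both the PyTorch reference (dequantize_weight) and
+# the CUDA kernel (ops.aqlm_dequant), and prints matching slices of each so
+# the two outputs can be compared by eye.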
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: + n = int(parts.sum().item()) + + device = torch.device("cuda:0") + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint( + -code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device, + ) + + codebooks = torch.randn( + size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device, + ) + + count = 0 + for index in range(16): + for i in range(8): + for book in range(nbooks): + codebooks[book, index, 0, i] = count * (10**book) + count += 1 + + print("codes shape", codes.shape) + + for i in range(16): + for book in range(nbooks): + codes[0, i, book] = i + codes[0, -i, book] = i + + weights = dequantize_weight(codes, codebooks, None) + weights2 = ops.aqlm_dequant(codes, codebooks, parts) + + print("weights shape:", weights.shape) + print("weights2 shape:", weights2.shape) + + print("weights are:", weights) + print("weights2 are:", weights2) + + print("first 128 weights are", weights[0, 0:128].to(torch.int32)) + print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) + + print("last 128 weights are", weights[0, -128:]) + print("last 128 weights2 are:", weights2[0, -128:]) + + +def main(): + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") + + # Add arguments + parser.add_argument( + "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" + ) + parser.add_argument( + "--bits", + type=int, + default=16, + help="Number of bits per code element (default: 16)", + ) + parser.add_argument( + "--test", + type=bool, + default=False, + help="Run the decompression/dequant tester rather than benchmarking " + "(default: False)", + ) + + # Parse the arguments + args = parser.parse_args() + + # Extract values + nbooks = args.nbooks + bits = args.bits + + if args.test: + dequant_test(4096, torch.tensor((4096,)), nbooks, bits) + return + + # Otherwise, benchmark. + methods = [ + ops.aqlm_gemm, + dequant_out_scale, + generic_dequantize_gemm, + optimized_dequantize_gemm, + dequant_weight_scale, + torch_mult, + dequant_no_scale, + ] + + filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" + print(f"writing benchmarks to file {filename}") + with open(filename, "w") as f: + sys.stdout = f + + print("m | k | n | n parts", end="") + for method in methods: + print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") + print("") + + # These are reasonable prefill sizes. + ksandpartions = ( + (4096, (4096, 4096, 4096)), + (4096, (4096,)), + (4096, (11008, 11008)), + (11008, (4096,)), + ) + + # reasonable ranges for m. + for m in [ + 1, + 2, + 4, + 8, + 10, + 12, + 14, + 16, + 24, + 32, + 48, + 52, + 56, + 64, + 96, + 112, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ]: + print(f"{m}", file=sys.__stdout__) + for ksp in ksandpartions: + run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) + + sys.stdout = sys.__stdout__ + + +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): + # I didn't see visible improvements from increasing these, but feel free :) + num_warmup_trials = 1 + num_trials = 1 + + num_calls = 100 + + # warmup. 
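+    # Each method gets num_warmup_trials untimed passes (num_calls kernel
+    # launches each) so compilation and caching don't skew the timed trials.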
+ for method in methods: + for _ in range(num_warmup_trials): + run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + nbooks=nbooks, + bits=bits, + method=method, + ) + + n = parts.sum().item() + print(f"{m} | {k} | {n} | {parts.tolist()}", end="") + + for method in methods: + best_time_us = 1e20 + for _ in range(num_trials): + kernel_dur_ms = run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + nbooks=nbooks, + bits=bits, + method=method, + ) + + kernel_dur_us = 1000 * kernel_dur_ms + + if kernel_dur_us < best_time_us: + best_time_us = kernel_dur_us + + print(f" | {kernel_dur_us:.0f}", end="") + + print("") + + +def run_timing( + num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method +) -> float: + n = int(parts.sum().item()) + + device = torch.device("cuda:0") + + input = torch.randn((1, m, k), dtype=torch.float16, device=device) + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint( + -code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device, + ) + + codebooks = torch.randn( + size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device, + ) + + scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) + + # for comparison to just a pytorch mult. + weights = torch.randn((n, k), dtype=torch.float16, device=device) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + + if method is torch_mult: + for i in range(num_calls): + torch_mult(input, weights, scales) + else: + for i in range(num_calls): + method(input, codes, codebooks, scales, parts, None) + + end_event.record() + end_event.synchronize() + + dur_ms = start_event.elapsed_time(end_event) / num_calls + return dur_ms + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_bitblas.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_bitblas.py new file mode 100644 index 0000000..97ee060 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_bitblas.py @@ -0,0 +1,242 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + MINIMUM_BITBLAS_VERSION, +) + +try: + import bitblas + + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) +except ImportError as e: + bitblas_import_exception = e + raise ValueError( + "Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. " + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + +from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target + +from vllm.utils import FlexibleArgumentParser + +parser = FlexibleArgumentParser( + description="Benchmark BitBLAS int4 on a specific target." +) + +# Add arguments to the parser +parser.add_argument( + "--target", + type=str, + default=auto_detect_nvidia_target(), + help="Specify the target device for benchmarking.", +) +parser.add_argument( + "--group_size", type=int, default=None, help="Group size for grouped quantization." 
+) +parser.add_argument( + "--A_dtype", + type=str, + default="float16", + choices=["float16", "float32", "float64", "int32", "int8"], + help="Data type of activation A.", +) +parser.add_argument( + "--W_dtype", + type=str, + default="int4", + choices=[ + "float16", + "float32", + "float64", + "int32", + "int8", + "int4", + "int2", + "int1", + "nf4", + "fp4_e2m1", + ], + help="Data type of weight W.", +) +parser.add_argument( + "--accum_dtype", + type=str, + default="float16", + choices=["float16", "int32"], + help="Data type for accumulation.", +) +parser.add_argument( + "--out_dtype", + type=str, + default="float16", + choices=["float16", "float32", "int32", "int8"], + help="Data type for output.", +) +parser.add_argument( + "--layout", + type=str, + default="nt", + choices=["nt", "nn"], + help="Matrix layout, 'nt' for non-transpose A and transpose W.", +) +parser.add_argument( + "--with_bias", action="store_true", help="Include bias in the benchmark." +) +parser.add_argument( + "--with_scaling", + action="store_true", + help="Include scaling factor in the quantization.", +) +parser.add_argument( + "--with_zeros", action="store_true", help="Include zeros in the quantization." +) +parser.add_argument( + "--zeros_mode", + type=str, + default=None, + choices=["original", "rescale", "quantized"], + help="Specify the mode for calculating zeros.", +) + +# Parse the arguments +args = parser.parse_args() + +# Assign arguments to variables +target = args.target +A_dtype = args.A_dtype +W_dtype = args.W_dtype +accum_dtype = args.accum_dtype +out_dtype = args.out_dtype +layout = args.layout +with_bias = args.with_bias +group_size = args.group_size +with_scaling = args.with_scaling +with_zeros = args.with_zeros +zeros_mode = args.zeros_mode + +# Define a list of shared arguments that repeat in every config +shared_args = [ + A_dtype, + W_dtype, + out_dtype, + accum_dtype, + layout, + with_bias, + group_size, + with_scaling, + with_zeros, + zeros_mode, +] + +# Define just the (M, K, N) shapes in a more compact list +shapes = [ + # square test + (1, 16384, 16384), + # BLOOM-176B + (1, 43008, 14336), + (1, 14336, 14336), + (1, 57344, 14336), + (1, 14336, 57344), + # OPT-65B + (1, 9216, 9216), + (1, 36864, 9216), + (1, 9216, 36864), + (1, 22016, 8192), + # LLAMA-70B/65B + (1, 8192, 22016), + (1, 8192, 8192), + (1, 28672, 8192), + (1, 8192, 28672), + # square test + (16384, 16384, 16384), + # BLOOM-176B + (8192, 43008, 14336), + (8192, 14336, 14336), + (8192, 57344, 14336), + (8192, 14336, 57344), + # OPT-65B + (8192, 9216, 9216), + (8192, 36864, 9216), + (8192, 9216, 36864), + (8192, 22016, 8192), + # LLAMA-70B/65B + (8192, 8192, 22016), + (8192, 8192, 8192), + (8192, 28672, 8192), + (8192, 8192, 28672), +] + +# Build test shapes with all the shared arguments +test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes] + +benchmark_sets = [] +benchmark_sets.extend(test_shapes) + +benchmark_results = {} +for config_class, operator, input_args in benchmark_sets: + config = config_class(*input_args) + matmul = operator(config, target=target, enable_tuning=True) + kernel_latency = matmul.profile_latency() + + print("Time cost is: {:.3f} ms".format(kernel_latency)) + + profile_config = { + f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": { + "BitBLAS_top20_latency": kernel_latency, + } + } + + benchmark_results.update(profile_config) + +# Define headers for the table +headers = [ + "PrimFunc", + "Input Arguments", + "BitBLAS Top20 Latency", +] + +# Calculate column 
widths for pretty printing +col_widths = [0, 0, 0] +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) + col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2) + col_widths[2] = max( + col_widths[2], + len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, + len(headers[2]) + 2, + ) + # break only if you want to measure widths from a single example; + # otherwise, let it loop over all items. + +# Print header +for i, header in enumerate(headers): + headers[i] = header.ljust(col_widths[i]) +print("".join(headers)) +print("-" * sum(col_widths)) + +# Print rows +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + row = [ + func_name, + input_args_str, + f"{values['BitBLAS_top20_latency']:.3f} ms", + ] + row_str = "".join( + [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)] + ) + print(row_str) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_cutlass_fp4_moe.py new file mode 100644 index 0000000..35c20ee --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -0,0 +1,490 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe +kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit +activations. The triton_moe kernel takes in fp8 weights(tensor scaled to fp8) +and 16-bit activations. 
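+Both kernels are also captured into CUDA graphs and timed via graph replay
+using torch.utils.benchmark.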
+""" + +import nvtx +import torch +import torch.utils.benchmark as benchmark + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk +from vllm.scalar_type import scalar_types +from vllm.utils import FlexibleArgumentParser + +WEIGHT_SHAPES_MOE = { + "nvidia/DeepSeek-R1-FP4": [ + [256, 8, 2048, 7168], + ], +} + +DEFAULT_MODELS = [ + "nvidia/DeepSeek-R1-FP4", +] + +DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) + + +def bench_run( + results: list[benchmark.Measurement], + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + label = "NVFP4 Blockscaled CUTLASS MOE vs FP8 Tensor Scaled Triton" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( + model, num_experts, topk, per_act_token, per_out_ch, mkn + ) + ) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + device = "cuda" + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 + + _, a_fp8_scale = ops.scaled_fp8_quant(a) + + w1_fp8q = torch.empty( + (num_experts, 2 * n, k), device=device, dtype=torch.float8_e4m3fn + ) + w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=torch.float8_e4m3fn) + w1_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + w2_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + + for expert in range(num_experts): + w1_fp8q[expert], w1_fp8scale[expert] = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_fp8scale[expert] = ops.scaled_fp8_quant(w2[expert]) + + w1_fp8q_notransp = w1_fp8q.clone() + w2_fp8q_notransp = w2_fp8q.clone() + w1_fp8q = w1_fp8q.transpose(1, 2) + w2_fp8q = w2_fp8q.transpose(1, 2) + + score = torch.randn((m, num_experts), device=device, dtype=dtype) + + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) + + quant_blocksize = 16 + w1_blockscale = torch.empty( + (num_experts, 2 * n, k // quant_blocksize), + device=device, + dtype=torch.float8_e4m3fn, + ) + w2_blockscale = torch.empty( + (num_experts, k, n // quant_blocksize), device=device, dtype=torch.float8_e4m3fn + ) + + # n_b_scales = 2 * n if per_out_ch else 1 + # k_b_scales = k if per_out_ch else 1 + w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), device=device, dtype=torch.uint8) + w2_fp4 = torch.empty((num_experts, k, n // 2), device=device, dtype=torch.uint8) + + w1_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) + w2_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) + a1_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) + a2_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) + + for expert in range(num_experts): + w1_e = w1[expert] + w2_e = w2[expert] + w1_amax = 
torch.abs(w1_e).max().to(torch.float32) + w2_amax = torch.abs(w2_e).max().to(torch.float32) + w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax + w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax + + w1_fp4[expert], w1_blockscale[expert] = ops.scaled_fp4_quant( + w1_e, w1_gs[expert] + ) + + w2_fp4[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( + w2_e, w2_gs[expert] + ) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_fp8_scale: torch.Tensor, + num_repeats: int, + ): + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + + def run_cutlass_moe_fp4( + a: torch.Tensor, + w1_fp4: torch.Tensor, + w2_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w2_blockscale: torch.Tensor, + w1_gs: torch.Tensor, + w2_gs: torch.Tensor, + a1_gs: torch.Tensor, + a2_gs: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + num_repeats: int, + ): + for _ in range(num_repeats): + with nvtx.annotate("cutlass_moe_fp4", color="green"): + cutlass_moe_fp4( + a=a, + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) + + def run_cutlass_from_graph( + a: torch.Tensor, + a1_gscale: torch.Tensor, + w1_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alphas: torch.Tensor, + a2_gscale: torch.Tensor, + w2_fp4: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alphas: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return cutlass_moe_fp4( + a=a, + a1_gscale=a1_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_alphas, + a2_gscale=a2_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_alphas, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) + + def run_triton_from_graph( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_fp8_scale: torch.Tensor, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph( + a=a, + a1_gscale=a1_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_gs, + a2_gscale=a2_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) + torch.cuda.synchronize() + + 
triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph( + a, + w1_fp8q_notransp, + w2_fp8q_notransp, + topk_weights, + topk_ids, + w1_fp8scale, + w2_fp8scale, + a_fp8_scale, + ) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + "w1_fp8q_notransp": w1_fp8q_notransp, + "w2_fp8q_notransp": w2_fp8q_notransp, + "w1_fp8scale": w1_fp8scale, + "w2_fp8scale": w2_fp8scale, + "a_fp8_scale": a_fp8_scale, + # Cutlass params + "a": a, + "a1_gscale": a1_gs, + "w1_fp4": w1_fp4, + "w1_blockscale": w1_blockscale, + "w1_alphas": w1_gs, + "a2_gscale": a2_gs, + "w2_fp4": w2_fp4, + "w2_blockscale": w2_blockscale, + "w2_alphas": w2_gs, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "m": m, + "n": n, + "k": k, + "e": num_experts, + "device": device, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe_fp4": run_cutlass_moe_fp4, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe( + a, + w1_fp8q_notransp, + w2_fp8q_notransp, + topk_weights, + topk_ids, + w1_fp8scale, + w2_fp8scale, + a_fp8_scale, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + + run_cutlass_moe_fp4( + a, + w1_fp4, + w2_fp4, + w1_blockscale, + w2_blockscale, + w1_gs, + w2_gs, + a1_gs, + a2_gs, + topk_weights, + topk_ids, + m, + n, + k, + num_experts, + device, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, topk_weights, topk_ids, m, n, k, e, device, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="cutlass_moe_fp4", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="cutlass_moe_fp4_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for per_out_ch in PER_OUT_CH_OPTS: + for size_m in args.batch_sizes: + mkn = (size_m, size_k, 
size_n) + bench_run( + results, + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark NVFP4 CUTLASS MOE across specified models/shapes/batches" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py new file mode 100644 index 0000000..1d4e730 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -0,0 +1,383 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES_MOE + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_experts, + fused_topk, +) +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = [ + "nm-testing/Mixtral-8x7B-Instruct-v0.1", + "nm-testing/deepseekv2-lite", + "ibm-granite/granite-3.0-1b-a400m", + "ibm-granite/granite-3.0-3b-a800m", +] +DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) + + +def bench_run( + results: list[benchmark.Measurement], + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + label = "Quant Matmul" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( + model, num_experts, topk, per_act_token, per_out_ch, mkn + ) + ) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10 + + _, a_scale = ops.scaled_fp8_quant(a) + + w1_q = torch.empty( + (num_experts, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn + ) + w2_q = torch.empty((num_experts, k, n), device="cuda", dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) + + for expert in range(num_experts): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) + 
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) + + score = torch.randn((m, num_experts), device="cuda", dtype=dtype) + + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, renormalize=False + ) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_scale: torch.Tensor, + num_repeats: int, + ): + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) + + def run_cutlass_moe( + a: torch.Tensor, + a_scale: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + per_act_token: bool, + num_repeats: int, + ): + for _ in range(num_repeats): + cutlass_moe_fp8( + a, + w1, + w2, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + per_act_token, + a1_scale=None, + ) + + def run_cutlass_from_graph( + a: torch.Tensor, + a_scale: torch.Tensor, + w1_q: torch.Tensor, + w2_q: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return cutlass_moe_fp8( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + per_act_token, + a1_scale=None, + ) + + def run_triton_from_graph( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_scale: torch.Tensor, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph( + a, + a_scale, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ) + torch.cuda.synchronize() + + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a_scale, + ) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + # Cutlass params + "a_scale": a_scale, + "w1_q": w1_q, + "w2_q": w2_q, + "w1_scale": w1_scale, + "w2_scale": w2_scale, + "per_act_token": per_act_token, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "a": a, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe": run_cutlass_moe, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a_scale, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_triton_moe(a, 
w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + run_cutlass_moe( + a, + a_scale, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + per_act_token, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for per_out_ch in PER_OUT_CH_OPTS: + for size_m in DEFAULT_BATCH_SIZES: + mkn = (size_m, size_k, size_n) + bench_run( + results, + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin across specified models/shapes/batches" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_layernorm.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_layernorm.py new file mode 100644 index 0000000..69978ec --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_layernorm.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import time + +import torch + +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + + 
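+# Measures the RMSNorm forward pass (optionally with a residual input) and
+# reports the mean per-iteration latency in microseconds.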
+@torch.inference_mode() +def main( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device("cuda") + + layer = RMSNorm(hidden_size).to(dtype=dtype) + layer.weight.data.normal_(mean=1.0, std=0.1) + scale = 1 / (2 * hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x *= scale + residual = torch.randn_like(x) * scale if add_residual else None + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + for _ in range(num_iters): + layer(x, residual) + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStop() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=num_warmup_iters, profile=False) + + # Benchmark. + if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=num_iters, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the layernorm kernel.") + parser.add_argument("--num-tokens", type=int, default=4096) + parser.add_argument("--hidden-size", type=int, default=8192) + parser.add_argument("--add-residual", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--num-warmup-iters", type=int, default=5) + parser.add_argument( + "--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. 
" + "If --profile is set, this number is ignored", + ) + + args = parser.parse_args() + print(args) + + main( + num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + add_residual=args.add_residual, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters, + ) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_lora.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_lora.py new file mode 100644 index 0000000..3d38d4b --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_lora.py @@ -0,0 +1,1065 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import json +import pickle +import time +from dataclasses import dataclass +from enum import Enum, auto +from itertools import product +from pathlib import Path +from typing import Any, Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import ArgPool, Bench, CudaGraphBenchParams +from weight_shapes import WEIGHT_SHAPES + +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink + from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT + +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_TP_SIZES = [1] +DEFAULT_BATCH_SIZES = [ + 1, + 16, + 32, + 64, + 128, + 192, + 256, + 320, + 384, + 448, + 512, + 640, + 768, + 896, + 1024, + 2048, + 3072, + 4096, + 5120, + 6144, + 7168, + 8192, +] +DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] +DEFAULT_LORA_RANKS = [16] +DEFAULT_NUM_LORAS = [1, 2, 3, 4] +DEFAULT_SORT_BY_LORA_IDS = [False, True] +DEFAULT_SEQ_LENGTHS = [1] +DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] + + +# Utilities +def dtype_to_str(dtype: torch.dtype): + if dtype == torch.float16: + return "f16" + if dtype == torch.bfloat16: + return "bf16" + if dtype == torch.float32: + return "f32" + raise ValueError(f"Unsupported dtype {dtype}") + + +def make_rand_lora_weight_tensor( + k: int, n: int, num_loras: int, dtype: torch.dtype, device: str = "cuda" +) -> torch.Tensor: + # LoRA weights column major + return torch.rand((num_loras, n, k), dtype=dtype).to(device) + + +def make_rand_tensors( + a_shape: tuple[int], + b_shape: tuple[int], + c_shape: tuple[int], + a_dtype: torch.dtype, + b_dtype: torch.dtype, + c_dtype: torch.dtype, + num_slices: int, + device: str = "cuda", +) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]: + """ + Make LoRA input/output matrices. + """ + A = torch.rand(a_shape, dtype=a_dtype).to(device) + + # LoRA weights column major + Bs = [torch.rand(b_shape, dtype=b_dtype).to(device) for _ in range(num_slices)] + + C = torch.zeros(c_shape, dtype=c_dtype).to(device) + return A, Bs, C + + +def make_prompt_lora_mapping( + num_prompts: int, num_active_loras: int, sort_by_lora_id: bool, device: str +) -> torch.Tensor: + """ + All prompts are mapped to a LoRA ID in range [0, num_active_loras). + where 0 refers to first lora, 1 refers to second lora and so on. + """ + assert num_active_loras > 0 + + if not sort_by_lora_id: + return torch.randint(0, num_active_loras, (num_prompts,), dtype=torch.long) + + # Divide LoRAs equally and in order. 
+ part_size = num_prompts // num_active_loras + part_size = max(part_size, 1) + + lora_id = 0 + prompt_lora_mapping = [] + while len(prompt_lora_mapping) < num_prompts: + prompt_lora_mapping.extend([lora_id] * part_size) + lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id + return torch.tensor( + prompt_lora_mapping[:num_prompts], dtype=torch.long, device=device + ) + + +def make_token_lora_mapping( + num_tokens: int, + num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, + device: str, +): + """ + Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor + """ + assert prompt_lora_mapping.shape[0] == num_prompts + + # token to lora index mapping + token_lora_mapping = [0] * num_tokens + current_offset = 0 + for b_id in range(num_prompts): + lora_index = prompt_lora_mapping[b_id].item() + s = current_offset + e = s + seq_len_tensor[b_id].item() + token_lora_mapping[s:e] = [lora_index] * (e - s) + current_offset += seq_len_tensor[b_id].item() + + return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) + + +def ref_group_gemm( + ref_out: torch.Tensor, + input: torch.Tensor, + lora_weights: list[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, + scaling: float, + add_inputs: Optional[bool], +): + """ + Torch group gemm reference implementation to test correctness of + benchmarking operations. + """ + batches = seq_lens_cpu.size(0) + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batches), seq_lens_cpu): + x = input[current_offset : b_length + current_offset, :] + current_offset += b_length + w = lora_weights[prompt_lora_mapping_cpu[lora_index]] + result = torch.nn.functional.linear(x, w) + result *= scaling + out_list.append(result) + + cat_result = torch.cat(out_list, dim=0) + + if add_inputs: + ref_out += cat_result + else: + ref_out.copy_(cat_result) + + +class OpType(Enum): + """ + LoRA Ops to benchmark and its properties. 
+ """ + + LORA_SHRINK = auto() + LORA_EXPAND = auto() + + @staticmethod + def from_str(s: str) -> "OpType": + if s.lower() == "lora_shrink": + return OpType.LORA_SHRINK + if s.lower() == "lora_expand": + return OpType.LORA_EXPAND + raise ValueError(f"Unrecognized str {s} to convert to OpType") + + def is_shrink_fn(self) -> bool: + return self in [OpType.LORA_SHRINK] + + def is_expand_fn(self) -> bool: + return self in [OpType.LORA_EXPAND] + + def num_slices(self) -> list[int]: + return [1, 2, 3] + + def mkn( + self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int + ) -> tuple[int, int, int]: + num_tokens = batch_size * seq_length + if self.is_shrink_fn(): + m = num_tokens + k = hidden_size + n = lora_rank + else: + assert self.is_expand_fn() + m = num_tokens + k = lora_rank + n = hidden_size + return m, k, n + + def matmul_dtypes( + self, op_dtype: torch.dtype + ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: + """ + return a type, b type and c type for A x B = C + """ + if self.is_shrink_fn(): + return op_dtype, op_dtype, torch.float32 + else: + assert self.is_expand_fn() + return torch.float32, op_dtype, op_dtype + + def matmul_shapes( + self, + batch_size: int, + seq_length: int, + hidden_size: int, + lora_rank: int, + num_loras: int, + num_slices: int, + ) -> tuple[tuple[int], tuple[int], tuple[int]]: + """ + Given num_slices, return the shapes of the A, B, and C matrices + in A x B = C, for the op_type + """ + m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) + + b_shape = (num_loras, n, k) # col-major + if self in [OpType.LORA_SHRINK]: + # LoRA shrink kernels support num_slices inherently in the kernel. + return ((m, k), b_shape, (num_slices, m, n)) + if self in [OpType.LORA_EXPAND]: + # LoRA expand kernels support num_slices inherently in the kernel + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + raise ValueError(f"Unrecognized op_type {self}") + + def bench_fn(self) -> Callable: + if self == OpType.LORA_SHRINK: + return lora_shrink + if self == OpType.LORA_EXPAND: + return lora_expand + + raise ValueError(f"Unrecognized optype {self}") + + def run_ref_group_gemm( + self, + output: torch.Tensor, + input: torch.Tensor, + lora_weights: list[torch.Tensor], + **kwargs, + ) -> Callable: + """Each benchmark operation expects the input, lora_weights and outputs + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. 
+ """ + w_dtype = lora_weights[0].dtype + num_slices = len(lora_weights) + if self in [OpType.LORA_SHRINK]: + for slice_idx in range(num_slices): + ref_group_gemm( + ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs, + ) + elif self in [OpType.LORA_EXPAND]: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset : slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs, + ) + else: + raise ValueError(f"Unrecognized optype {self}") + + +@dataclass +class BenchmarkContext: + """ + LoRA benchmark context + """ + + batch_size: int + hidden_size: int + num_loras: int + num_active_loras: int + lora_rank: int + sort_by_lora_id: bool + dtype: torch.dtype + seq_length: Optional[int] = None + num_slices: Optional[int] = None # num_slices for slice based ops + + def with_seq_length(self, seq_length: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.seq_length = seq_length + return ctx + + def with_num_slices(self, num_slices: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.num_slices = num_slices + return ctx + + def bench_label(self) -> str: + return f"lora-{self.dtype}" + + def bench_sublabel(self, op_type: OpType) -> str: + m, k, n = op_type.mkn( + self.batch_size, self.seq_length, self.hidden_size, self.lora_rank + ) + desc = { + "bs": self.batch_size, + "sl": self.seq_length, + "m": m, + "k": k, + "n": n, + "num_loras": self.num_loras, + "sort_by_lora": self.sort_by_lora_id, + "num_slices": self.num_slices, + } + return json.dumps(desc) + + +@dataclass +class BenchmarkTensors: + """ + Input/Output tensors used for benchmarks + """ + + # matmul tensors + input: torch.Tensor + lora_weights_lst: list[torch.Tensor] + output: torch.Tensor + # LoRA kernel metadata + lora_kernel_meta: LoRAKernelMeta + # Metadata tensors used in testing correctness + seq_lens: torch.Tensor + prompt_lora_mapping: torch.Tensor + + def io_types(self) -> str: + return ( + f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}" + ) + + @staticmethod + def make( + ctx: BenchmarkContext, op_type: OpType, device: str = "cuda" + ) -> "BenchmarkTensors": + # Make input / output matmul tensors. + a_shape, b_shape, c_shape = op_type.matmul_shapes( + ctx.batch_size, + ctx.seq_length, + ctx.hidden_size, + ctx.lora_rank, + ctx.num_loras, + ctx.num_slices, + ) + a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) + input_tensor, lora_weights, output_tensor = make_rand_tensors( + a_shape, b_shape, c_shape, a_type, b_type, c_type, num_slices=ctx.num_slices + ) + + # Make metadata tensors. + # Keep the metadata tensors in the CPU for further processing if needed. + # The tensors get moved to the GPU before benchmarking. + assert ctx.num_active_loras <= ctx.num_loras + total_tokens = ctx.batch_size * ctx.seq_length + + # Make metadata tensors involved in correctness testing. 
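+        # Sequence lengths are deliberately uniform: the width-1 randint below
+        # always yields ctx.seq_length, so total_tokens == seq_len_tensor.sum().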
+ # Prepare seq lens tensor + seq_len_tensor = torch.randint( + ctx.seq_length, ctx.seq_length + 1, (ctx.batch_size,) + ) + assert total_tokens == seq_len_tensor.sum() + # Prepare prompt lora indices tensor + prompt_lora_indices_tensor = make_prompt_lora_mapping( + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu" + ) + + # Make LoRAKernelMeta + token_lora_indices_tensor = make_token_lora_mapping( + total_tokens, + ctx.batch_size, + prompt_lora_indices_tensor, + seq_len_tensor, + "cpu", + ) + lora_kernel_meta = LoRAKernelMeta.make( + max_loras=ctx.num_loras, + max_num_tokens=token_lora_indices_tensor.size(0), + device="cpu", + ) + lora_kernel_meta.prepare_tensors(token_lora_mapping=token_lora_indices_tensor) + + return BenchmarkTensors( + input_tensor, + lora_weights, + output_tensor, + lora_kernel_meta, + seq_len_tensor, + prompt_lora_indices_tensor, + ) + + def sanity_check(self) -> None: + """ + Fails asserts when non-conformality is detected. + """ + num_tokens = self.input.shape[-2] + # check metadata tensors + assert torch.sum(self.seq_lens) == num_tokens + num_seqs = self.seq_lens.shape[0] + # assert self.seq_start_loc.shape[0] == num_seqs + assert self.prompt_lora_mapping.shape[0] == num_seqs + assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens + + def to_device(self, device: str): + """ + Transfer tensors to device if the tensors aren't already on the device + """ + + def to_device(tensor: torch.Tensor): + if tensor.device != device: + tensor = tensor.to(device=device) + return tensor + + self.input = to_device(self.input) + self.output = to_device(self.output) + self.seq_lens = to_device(self.seq_lens) + self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) + for i in range(len(self.lora_weights_lst)): + self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + + # LoRA meta + for field_name in LoRAKernelMeta.__dataclass_fields__: + field = getattr(self.lora_kernel_meta, field_name) + assert isinstance(field, torch.Tensor) + setattr(self.lora_kernel_meta, field_name, to_device(field)) + + def metadata(self) -> tuple[int, int, int]: + """ + Return num_seqs, num_tokens and max_seq_len + """ + num_seqs = self.seq_lens.shape[0] + num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] + max_seq_len = torch.max(self.seq_lens).item() + num_slices = len(self.lora_weights_lst) + return num_seqs, num_tokens, max_seq_len, num_slices + + def as_lora_shrink_kwargs(self) -> dict[str, Any]: + self.sanity_check() + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata() + + # Sanity check matrix shapes. 
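+        # For example (illustrative numbers only): with num_tokens=16,
+        # hidden_size=4096, lora_rank=16, num_loras=4 and num_slices=3, the
+        # checks below expect input (16, 4096), each LoRA weight
+        # (4, 16, 4096) and output (3, 16, 16).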
+ i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + "inputs": self.input, + "lora_a_weights": self.lora_weights_lst, + "output_tensor": self.output, + "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, + "token_indices_sorted_by_lora_ids": ( + self.lora_kernel_meta.token_indices_sorted_by_lora_ids + ), + "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, + "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, + "lora_ids": self.lora_kernel_meta.active_lora_ids, + "scaling": 1.0, + } + + def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: + self.sanity_check() + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata() + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + "inputs": self.input, + "lora_b_weights": self.lora_weights_lst, + "output_tensor": self.output, + "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, + "token_indices_sorted_by_lora_ids": ( + self.lora_kernel_meta.token_indices_sorted_by_lora_ids + ), + "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, + "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, + "lora_ids": self.lora_kernel_meta.active_lora_ids, + "offset_start": 0, + "add_inputs": add_inputs, + } + + def bench_fn_kwargs( + self, op_type: OpType, add_inputs: Optional[bool] = None + ) -> dict[str, Any]: + if op_type.is_shrink_fn(): + assert add_inputs is None + else: + assert add_inputs is not None + + if op_type == OpType.LORA_SHRINK: + return self.as_lora_shrink_kwargs() + if op_type == OpType.LORA_EXPAND: + return self.as_lora_expand_kwargs(add_inputs) + raise ValueError(f"Unrecognized optype {self}") + + def test_correctness( + self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + ) -> bool: + """ + Test correctness of op_type implementation against a grouped gemm + reference implementation. 
+ """ + seq_lens_cpu = self.seq_lens.to(device="cpu") + prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu") + ref_output = self.output.clone() + + self.output.zero_() + op_type.bench_fn()(**self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) + + op_type.run_ref_group_gemm( + ref_output, + self.input, + self.lora_weights_lst, + seq_lens_cpu=seq_lens_cpu, + prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, + scaling=1.0, + add_inputs=expand_fn_add_inputs, + ) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[self.output.dtype] + + return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) + + +def bench_optype( + ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, + expand_fn_add_inputs: Optional[bool] = None, + test_correctness: bool = False, +) -> TMeasurement: + assert arg_pool_size >= 1 + if op_type.is_shrink_fn(): + assert expand_fn_add_inputs is None + else: + assert expand_fn_add_inputs is not None + + # BenchmarkContext -> BenchmarkTensors + bench_tensors: list[BenchmarkTensors] = [ + BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size) + ] + for bt in bench_tensors: + bt.sanity_check() + + # Test correctness of our implementation. + if test_correctness: + assert all( + [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors] + ) + + # BenchmarkTensors -> dict (kwargs) + kwargs_list = [ + bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + for bt in bench_tensors + ] + + # Clear LoRA optimization hash-maps. + _LORA_A_PTR_DICT.clear() + _LORA_B_PTR_DICT.clear() + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + for kwargs in kwargs_list: + op_type.bench_fn()(**kwargs) + torch.cuda.synchronize() + + # Merge into a single kwargs and qualify arguments as ArgPool + kwargs = {k: ArgPool([]) for k in kwargs_list[0]} + for _kwargs in kwargs_list: + for k, v in _kwargs.items(): + kwargs[k].values.append(v) + + describe_args = ( + f"add_inputs={expand_fn_add_inputs}" if expand_fn_add_inputs is not None else "" + ) + description = f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})" + + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + timer = None + with Bench( + cuda_graph_params, + ctx.bench_label(), + ctx.bench_sublabel(op_type), + description, + op_type.bench_fn(), + **kwargs, + ) as bench: + timer = bench.run() + return timer + + +def bench_torch_mm( + ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, +) -> TMeasurement: + """ + Benchmark basic torch.mm as a roofline. + + When all the input tokens have the same LoRA ID, the LoRA kernels are just + a matmul. This torch.mm benchmark serves as a roofline for that case. + + input op_type is used in determining the m, k, n dimensions for the matmul. + """ + + batch_size, hidden_size, lora_rank, seq_length, dtype = ( + ctx.batch_size, + ctx.hidden_size, + ctx.lora_rank, + ctx.seq_length, + ctx.dtype, + ) + + m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) + # For a fairer comparison. 
+    n = n * ctx.num_slices
+
+    # Get matmul input and output tensors for A x B = C
+    As, Bs, Cs = [], [], []
+    for _ in range(arg_pool_size):
+        As.append(torch.rand((m, k), dtype=dtype).to("cuda"))
+        Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t())
+        Cs.append(torch.rand((m, n), dtype=dtype).to("cuda"))
+
+    # Make torch.mm kwargs
+    mm_kwargs = {"input": ArgPool(As), "mat2": ArgPool(Bs), "out": ArgPool(Cs)}
+
+    description = (
+        f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}"
+        f"x{dtype_to_str(dtype)}"
+        f"=>{dtype_to_str(dtype)})"
+    )
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    with Bench(
+        cuda_graph_params,
+        ctx.bench_label(),
+        ctx.bench_sublabel(op_type),
+        description,
+        torch.mm,
+        **mm_kwargs,
+    ) as bench:
+        return bench.run()
+
+
+# runner
+def use_cuda_graph_recommendation() -> str:
+    return """
+    Triton kernels have a significant launch overhead when
+    launched directly via Python. This overhead is more noticeable
+    for small problem sizes. For these cases, it is recommended
+    to use the script with `--cuda-graph-nops N` to benchmark N
+    consecutive invocations of the benchmarking operations from
+    inside a CUDA Graph. Note that the returned measurement is for N
+    invocations of the operation.
+    """
+
+
+def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+    if args and args.cuda_graph_nops:
+        print(
+            f"Note: The timings reported above are for {args.cuda_graph_nops} "
+            "consecutive invocations of the benchmarking functions. "
+            f"Please divide by {args.cuda_graph_nops} for single invocation "
+            "timings."
+        )
+
+    print(
+        "Note on comparison with torch.mm: The torch.mm numbers are "
+        "benchmark numbers of a simple matmul emulating the single LoRA "
+        "case. It is provided as a roofline for comparing our LoRA kernel "
+        "implementations. It is expected that the LoRA kernels will be "
+        "slower than torch.mm in cases where num_loras is large. But for "
+        "small num_loras the goal should be to match the torch.mm numbers."
+ ) + + +def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): + if args.cuda_graph_nops is not None: + assert args.cuda_graph_nops > 0 + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA Graph") + else: + print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") + + timers = [] + for bench_ctx in bench_ctxs: + for seq_len in args.seq_lengths: + bench_ops: list[OpType] = args.op_types + seq_len_timers = [] + for bench_op in bench_ops: + for num_slices in bench_op.num_slices(): + _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( + num_slices + ) + # Benchmark torch.mm as a roofline + seq_len_timers.append( + bench_torch_mm( + _ctx, args.arg_pool_size, bench_op, args.cuda_graph_nops + ) + ) + + # Benchmark bench_op + expand_fn_add_inputs = ( + [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + ) + for add_input_arg in expand_fn_add_inputs: + seq_len_timers.append( + bench_optype( + _ctx, + args.arg_pool_size, + bench_op, + args.cuda_graph_nops, + add_input_arg, + args.test_correctness, + ) + ) + + print_timers(seq_len_timers) + timers.extend(seq_len_timers) + + # Result stdout dump + print("== All Results ====") + print_timers(timers, args) + + if args.output_directory: + # Result file dump + od = Path(args.output_directory) + if not od.exists(): + od.mkdir() + + timestamp = int(time.time()) + pkl_file = od / f"lora_bench-{timestamp}.pkl" + print(f"Writing benchmarks to {pkl_file}") + with open(pkl_file, "wb") as f: + pickle.dump(timers, f) + + +def as_benchmark_contexts( + hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace +) -> list[BenchmarkContext]: + ctxs: list[BenchmarkContext] = [] + for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + args.batch_sizes, + list(hidden_sizes), + lora_ranks, + args.num_loras, + args.sort_by_lora_id, + ): + ctxs.append( + BenchmarkContext( + batch_size=batch_size, + hidden_size=hidden_size, + lora_rank=lora_rank, + num_loras=num_loras, + num_active_loras=args.num_active_loras + if args.num_active_loras + else num_loras, + # To be filled based on the OpType to benchmark + seq_length=None, + sort_by_lora_id=sort_by_lora_id, + dtype=args.dtype, + # To be filled based on the OpType to benchmark + num_slices=None, + ) + ) + + return ctxs + + +def run_list_bench(args: argparse.Namespace): + print(args) + + print( + "List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}" + ) + + # Get all benchmarking contexts + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args + ) + + run(args, bench_contexts) + + +def run_range_bench(args: argparse.Namespace): + print(args) + + hidden_sizes = list( + range( + args.hidden_sizes_start, + args.hidden_sizes_end + 1, + args.hidden_sizes_increment, + ) + ) + lora_ranks = list( + range(args.lora_ranks_start, args.lora_ranks_end + 1, args.lora_ranks_increment) + ) + + print(f"Range bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args + ) + + run(args, bench_contexts) + + +def run_model_bench(args: argparse.Namespace): + print(args) + + def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: + hidden_sizes = set() + for KN, tp_split_dim in WEIGHT_SHAPES[model]: + KN[tp_split_dim] = KN[tp_split_dim] // 
tp_size + hidden_sizes.add(KN[1]) + return hidden_sizes + + # Get all hidden sizes + hidden_sizes: set[int] = set() + for model_name, tp_size in product(args.models, args.tp_sizes): + hidden_sizes = hidden_sizes.union(hidden_sizes_from_model(model_name, tp_size)) + + print(f"Model bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args + ) + + run(args, bench_contexts) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "torch.float16": + return torch.float16 + if dt == "torch.bfloat16": + return torch.bfloat16 + raise ValueError("unsupported dtype") + + def get_bool(s: str) -> bool: + return s.lower() in ["true", "1"] + + def add_common_command_args(p: argparse.ArgumentParser): + p.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['torch.float16', 'torch.bfloat16']", + ) + + p.add_argument( + "--arg-pool-size", + type=int, + default=32, + help="Run profiles with a pool of input/output/meta tensors instead" + "of simply reusing the same tensors for all runs. A bigger arg-pool" + "mitigates hardware caching effects during benchmarking.", + ) + + p.add_argument( + "--cuda-graph-nops", + type=int, + help=( + "when set profiling is done using cudagraph, " + "with the given number of operations in a graph." + "Note that the measurement returned is the time " + "taken for N consecutive executions of the benchmarking " + "functions, where N is the value of this argument." + ), + ) + p.add_argument("--num-loras", nargs="+", type=int, default=DEFAULT_NUM_LORAS) + p.add_argument( + "--num-active-loras", + type=int, + default=None, + help="Active LoRAs. 
When None, all LoRAs are active", + ) + p.add_argument( + "--sort-by-lora-id", + nargs="+", + type=get_bool, + default=DEFAULT_SORT_BY_LORA_IDS, + ) + p.add_argument( + "--op-types", nargs="+", type=OpType.from_str, default=list(OpType) + ) + p.add_argument( + "--seq-lengths", nargs="+", type=int, default=DEFAULT_SEQ_LENGTHS + ) + p.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + p.add_argument( + "--expand-fn-add-inputs", + nargs="+", + type=get_bool, + default=DEFAULT_EXPAND_FN_ADD_INPUTS, + ) + p.add_argument( + "-o", + "--output-directory", + type=str, + help=( + "Output directory to store a the list of benchmarking" + "TMeasurement objects as a pickle file" + ), + ) + + p.add_argument( + "--test-correctness", + action="store_true", + help=( + "When enabled, the benchmarking functions are tested" + "for correctness before the actual benchmarking" + ), + ) + + parser = FlexibleArgumentParser( + description=f""" +Benchmark LoRA kernels: + {use_cuda_graph_recommendation()} + + list_bench example: + python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + model_bench example: + python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + range_bench example: + python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + subparsers = parser.add_subparsers(dest="cmd", required=True) + + list_parser = subparsers.add_parser("list_bench") + list_parser.add_argument( + "--hidden-sizes", nargs="+", type=int, default=DEFAULT_HIDDEN_SIZES + ) + list_parser.add_argument( + "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS + ) + add_common_command_args(list_parser) + list_parser.set_defaults(func=run_list_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--hidden-sizes-start", type=int, required=True) + range_parser.add_argument("--hidden-sizes-end", type=int, required=True) + range_parser.add_argument("--hidden-sizes-increment", type=int, required=True) + range_parser.add_argument("--lora-ranks-start", type=int, required=True) + range_parser.add_argument("--lora-ranks-end", type=int, required=True) + range_parser.add_argument("--lora-ranks-increment", type=int, required=True) + add_common_command_args(range_parser) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS + ) + add_common_command_args(model_parser) + 
model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_machete.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_machete.py new file mode 100644 index 0000000..f73d051 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_machete.py @@ -0,0 +1,732 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import itertools +import math +import os +import pickle as pkl +import time +from collections.abc import Iterable +from dataclasses import dataclass +from itertools import product +from typing import Callable, Optional + +import pandas as pd +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + marlin_permute_scales, + marlin_zero_points, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + pack_rows, + quantize_weights, +) +from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] +DEFAULT_TP_SIZES = [1] + +NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False) + +if NVTX_PROFILE: + import nvtx + + +def terse_type_name(dt): + return { + torch.bfloat16: "bf16", + torch.float16: "fp16", + torch.int8: "int8", + torch.float8_e4m3fn: "fp8", + torch.float: "float", + torch.int: "int", + }[dt] + + +@dataclass +class BenchmarkTensors: + w_ref: torch.Tensor + a: torch.Tensor + + w_q: torch.Tensor + group_size: Optional[int] + wtype: ScalarType + w_g_s: torch.Tensor + w_g_zp: Optional[torch.Tensor] + w_ch_s: Optional[torch.Tensor] + w_tok_s: Optional[torch.Tensor] + + +@dataclass +class TypeConfig: + act_type: torch.dtype + weight_type: ScalarType + output_type: Optional[torch.dtype] + group_scale_type: Optional[torch.dtype] + group_zero_type: Optional[torch.dtype] + channel_scale_type: Optional[torch.dtype] + token_scale_type: Optional[torch.dtype] + + +def rand_data(shape, dtype=torch.float16, scale=1): + if dtype.is_floating_point: + return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype) + else: + return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") + + +def quantize_and_pack( + atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False, +): + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, + wtype, + group_size=group_size, + zero_points=zero_points, + # to match how the kernel applies zps + ref_zero_points_after_scales=True, + ) + + w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) + return w_ref, w_q, w_s, w_zp + + +def create_bench_tensors( + shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] +) -> list[BenchmarkTensors]: + m, n, k = shape + + # we want to make sure that weights don't fit into L2 cache between runs so + # we construct enough weights to exceed L2 cache, which is 50mb on a H100 + # so we target total weight size > 2*50mb 
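+    # The expression below converts the 100 MiB target to bits and divides by
+    # the packed size of one weight matrix. For example (illustrative
+    # numbers): with k = n = 4096 and 4-bit weights,
+    # math.ceil(2 * 50 * 1024**2 * 8 / (4096 * 4096 * 4)) == 13 weights.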
+ num_weights = math.ceil( + 2 * 50 * 1024**2 * 8 / (k * n * types.weight_type.size_bits) + ) + + a = rand_data((m, k), types.act_type, scale=5) + + benchmark_tensors: list[BenchmarkTensors] = [] + for _ in range(num_weights): + w = rand_data((k, n), types.act_type, scale=5) + + if types.group_scale_type is not None: + w = w.to(types.group_scale_type) + if w.dtype.itemsize == 1: + w = w.to(torch.float16) + + w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( + a.dtype, + w, + types.weight_type, + types.group_scale_type, + group_size, + types.group_zero_type is not None, + ) + + if not a.dtype.is_floating_point: + aiinfo = torch.iinfo(a.dtype) + w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) + + w_ref = w_ref.to(torch.float32) + + w_ch_s = ( + None + if types.channel_scale_type is None + else rand_data((n,), types.channel_scale_type) + ) + w_tok_s = ( + None + if types.token_scale_type is None + else rand_data((m,), types.token_scale_type) + ) + + benchmark_tensors.append( + BenchmarkTensors( + w_ref=w_ref, + a=a, + w_q=w_q_packed, + wtype=types.weight_type, + w_g_s=w_s, + w_g_zp=w_zp, + group_size=group_size, + w_ch_s=w_ch_s, + w_tok_s=w_tok_s, + ) + ) + + return benchmark_tensors + + +def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable: + a = bt.a + w = bt.w_ref.to(bt.a.dtype) # use float reference tensor + if a.dtype not in [torch.float16, torch.bfloat16]: + a = a.to(torch.float16) + w = w.to(torch.float16) + return lambda: torch.matmul(a, w) + + +def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: + if bt.w_ch_s is not None and bt.w_tok_s is not None: + scale_a = bt.w_tok_s.to(torch.float32) + scale_b = bt.w_ch_s.to(torch.float32) + else: + scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() + return lambda: ops.cutlass_scaled_mm( + bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16 + ) + + +def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: + device = bt.a.device + + workspace = MarlinWorkspace( + bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) + + if bt.w_g_zp is None: + w_zp = torch.empty(0, dtype=torch.int, device=device) + else: + w_zp = marlin_zero_points( + bt.w_g_zp, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits + ) + + if bt.group_size is None: + w_s = torch.tensor([], device="cuda", dtype=torch.half) + else: + w_s = marlin_permute_scales( + bt.w_g_s, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.group_size + ) + + sort_indices = torch.empty(0, dtype=torch.int, device=device) + g_idx = torch.empty(0, dtype=torch.int, device=device) + w_q = ops.gptq_marlin_repack( + bt.w_q, sort_indices, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits + ) + + if bt.a.dtype.is_floating_point: + assert bt.w_ch_s is None + assert bt.w_tok_s is None + assert bt.group_size is not None + + fn = lambda: ops.gptq_marlin_gemm( + a=bt.a, + c=None, + b_q_weight=w_q, + b_scales=w_s, + global_scale=None, + b_zeros=w_zp, + g_idx=g_idx, + perm=sort_indices, + workspace=workspace.scratch, + b_q_type=bt.wtype, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + is_k_full=True, + is_zp_float=False, + ) + else: + assert bt.a.dtype == torch.int8 + assert bt.wtype == scalar_types.uint4b8 + + if bt.w_ch_s is not None: + s_ch = bt.w_ch_s.to(torch.float32) + else: + s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) + 
+ if bt.w_tok_s is not None: + s_tok = bt.w_tok_s.to(torch.float32) + else: + s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) + + fn = lambda: ops.marlin_qqq_gemm( + a=bt.a, + b_q_weight=w_q, + s_group=w_s, + s_tok=s_tok, + s_ch=s_ch, + workspace=workspace.scratch, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + ) + + return fn + + +def machete_create_bench_fn( + bt: BenchmarkTensors, out_type=torch.dtype, schedule=None +) -> Callable: + w_q = bt.w_q.t().contiguous().t() # make col major + w_q = ops.machete_prepack_B( + w_q, bt.a.dtype, bt.wtype, None if bt.w_g_s is None else bt.w_g_s.dtype + ) + + w_g_zp = bt.w_g_zp + if w_g_zp is not None: + w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype)) + + return lambda: ops.machete_mm( + a=bt.a, + b_q=w_q, + b_type=bt.wtype, + b_group_scales=bt.w_g_s, + b_group_zeros=w_g_zp, + b_group_size=bt.group_size, + b_channel_scales=bt.w_ch_s, + a_token_scales=bt.w_tok_s, + out_type=out_type, + schedule=schedule, + ) + + +# impl + +# bench + + +def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]): + min_run_time = 1 if not NVTX_PROFILE else 0.1 + res = TBenchmark.Timer( + stmt=""" + for fn in fns: + fn() + """, + globals={"fns": fns}, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + if NVTX_PROFILE: + with ( + nvtx.annotate("mm-bench"), + nvtx.annotate(f"{label}|{sub_label}|{description}"), + ): + fns[0]() + + return res + + +_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None +_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None + + +def bench( + types: TypeConfig, + group_size: int, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + sweep_schedules: bool = True, +) -> list[TMeasurement]: + benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) + sub_label += f", L={len(benchmark_tensors)}" + + name_type_string = f"W{types.weight_type}" + f"-A{terse_type_name(types.act_type)}" + if types.group_scale_type is not None: + name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" + if types.group_zero_type is not None: + name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}" + if group_size is not None: + name_type_string += f"-G{group_size}" + if types.channel_scale_type is not None: + name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}" + if types.token_scale_type is not None: + name_type_string += f"-TS{terse_type_name(types.token_scale_type)}" + + timers = [] + # pytorch impl + timers.append( + bench_fns( + label, + sub_label, + "torch.matmul (fp16)", + [torch_matmul_f16_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) + + if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: + timers.append( + bench_fns( + label, + sub_label, + f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", + [cutlass_scaled_mm_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) + + if types.act_type != torch.float8_e4m3fn: + timers.append( + bench_fns( + label, + sub_label, + f"marlin ({name_type_string})", + [marlin_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) + + # machete + timers.append( + bench_fns( + label, + sub_label, + f"machete ({name_type_string})", + [ + machete_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ], + ) + ) + + if sweep_schedules: + global _SWEEP_SCHEDULES_RESULTS + + print("Finding best schedule for machete") + best = None + best_schedule = None + 
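+        # The loop below parses the M tile from the schedule-name prefix
+        # (e.g. "128x16_..." -> schedule_M=16), skips schedules whose M tile
+        # is far larger or smaller than the problem m, times the remaining
+        # schedules, records every result in _SWEEP_SCHEDULES_RESULTS and
+        # keeps the one with the lowest median time.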
schedules = ops.machete_supported_schedules( + a_type=types.act_type, + b_type=types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_zero_type, + token_scales_type=types.token_scale_type, + channel_scales_type=types.channel_scale_type, + out_type=types.output_type, + ) + + if schedules is None or len(schedules) == 0: + raise ValueError("No schedules found to sweep") + + for schedule in reversed(schedules): + schedule_M = int(schedule.split("_")[0].split("x")[1]) + + # Prune known bad schedules + if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: + continue + + res = bench_fns( + label, + sub_label, + "machete_best", + [ + machete_create_bench_fn( + bt, out_type=types.output_type, schedule=schedule + ) + for bt in benchmark_tensors + ], + ) + + results_row = { + "M": m, + "K": k, + "N": n, + "group_size": group_size, + "schedule": schedule, + "median": res.median, + } + if _SWEEP_SCHEDULES_RESULTS is None: + _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(columns=results_row.keys()) + _SWEEP_SCHEDULES_RESULTS.loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row + + print(f" {res.median:5.5} ", schedule) + if not best or res.median < best.median: + best = res + best_schedule = schedule + print("Best schedule:", best_schedule) + timers.append(best) + + return timers + + +# runner +def print_timers(timers: list[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: + types = TypeConfig( + act_type=args.act_type, + weight_type=scalar_types.uint4b8 + if args.group_zero_type is None + else scalar_types.uint4, + output_type=args.out_type, + group_scale_type=args.group_scale_type, + group_zero_type=args.group_zero_type, + channel_scale_type=args.channel_scale_type, + token_scale_type=args.token_scale_type, + ) + + results: list[TMeasurement] = [] + for m, k, n in MKNs: + timers = bench( + types, + args.group_size, + m, + k, + n, + f"{args.act_type}-gemm", + f"MKN=({m}x{k}x{n})", + sweep_schedules=args.sweep_schedules, + ) + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output( + data: list[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, args.sweep_schedules, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) + m_increment, k_increment, n_increment = ( + int(x) for x in args.dim_increment.split(",") + ) + Ms = list(range(m_start, m_end + 1, m_increment)) + Ks = list(range(k_start, k_end + 1, k_increment)) + Ns = list(range(n_start, n_end + 1, n_increment)) + MKNs = list(product(Ms, Ks, Ns)) + + data = run(args.dtype, args.sweep_schedules, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] 
{model}") + + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args, MKNs) + model_bench_data.append(data) + + type_string = f"{args.act_type}" + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {type_string} {model}-TP{tp_size} ====") + print_timers(data) + + timestr = time.strftime("%Y%m%d-%H%M%S") + + all_results = [] + for d in model_bench_data: + all_results.extend(d) + + # pickle all data + with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: + args_dict = vars(args) + args_dict.pop("func") + pkl.dump( + { + "args": args_dict, + "results": all_results, + }, + f, + ) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + return { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "int8": torch.int8, + "float8_e4m3fn": torch.float8_e4m3fn, + "int": torch.int, + "float": torch.float, + }[dt] + + class ToTorchDtype(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, to_torch_dtype(values)) + + parser = FlexibleArgumentParser( + description=""" +Benchmark Machete GEMM. + + To run square GEMMs: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--act-type", + action=ToTorchDtype, + required=True, + choices=["bfloat16", "float16", "int8", "float8_e4m3fn"], + ) + parser.add_argument( + "--group-scale-type", + action=ToTorchDtype, + choices=["bfloat16", "float16"], + ) + parser.add_argument( + "--group-zero-type", + type=to_torch_dtype, + choices=["bfloat16", "float16"], + ) + parser.add_argument( + "--channel-scale-type", + action=ToTorchDtype, + choices=["float"], + ) + parser.add_argument( + "--token-scale-type", + action=ToTorchDtype, + choices=["float"], + ) + parser.add_argument( + "--out-type", + action=ToTorchDtype, + choices=["bfloat16", "float16"], + ) + parser.add_argument( + "--group-size", + type=int, + help="Available options are ['None', '-1', '128'], default=128", + default=128, + ) + parser.add_argument( + "--sweep-schedules", + action="store_true", + help="Run a sweep over all supported schedules", + ) + parser.add_argument( + "--sweep-csv-out", + help="CSV to store sweep results", + default="sch_sweep_results.csv", + ) + subparsers = parser.add_subparsers(dest="cmd", required=True) + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument( + "--dim-start", + type=str, + required=True, + help="Start value for M,K,N as common separated list", + ) + range_parser.add_argument( + "--dim-end", + type=str, + required=True, + help="End value (inclusive) for M,K,N as common separated list", + ) + range_parser.add_argument( + "--dim-increment", + type=str, + required=True, + help="Increment value for M,K,N as common separated list", + ) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + + _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out + args.func(args) + + if _SWEEP_SCHEDULES_RESULTS is not None: + _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_marlin.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_marlin.py new file mode 100644 index 0000000..34cc45e --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_marlin.py @@ -0,0 +1,413 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, + GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, + GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES, +) +from vllm.model_executor.layers.quantization.utils.allspark_utils import ( + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + ALLSPARK_SUPPORTED_QUANT_TYPES, +) +from 
vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + MARLIN_SUPPORTED_GROUP_SIZES, + query_marlin_supported_quant_types, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + FP4_MARLIN_SUPPORTED_GROUP_SIZES, + rand_marlin_weight_fp4_like, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + marlin_quant_fp8_torch, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, + awq_marlin_quantize, + marlin_quantize, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( + marlin_24_quantize, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + gptq_pack, + gptq_quantize_weights, + quantize_weights, + sort_weights, +) +from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] + +ACT_ORDER_OPTS = [False, True] +K_FULL_OPTS = [False, True] + + +def bench_run( + results: list[benchmark.Measurement], + model: str, + act_order: bool, + is_k_full: bool, + quant_type: ScalarType, + group_size: int, + size_m: int, + size_k: int, + size_n: int, +): + label = "Quant Matmul" + sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( + model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n + ) + print(f"Testing: {sub_label}") + + a = torch.randn(size_m, size_k).to(torch.half).cuda() + b = torch.rand(size_k, size_n).to(torch.half).cuda() + has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8] + if act_order and (group_size == -1 or group_size == size_k or has_zp): + return + if size_k % group_size != 0: + return + + marlin_24_supported = ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES + ) + repack_supported = ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in MARLIN_SUPPORTED_GROUP_SIZES + ) + allspark_supported = ( + quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES + and group_size == -1 + and not act_order + and is_k_full + ) + + def gen_marlin_params(): + # Marlin quant + marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None + if quant_type == scalar_types.float4_e2m1f: + if group_size != 16 or act_order: + return + marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like( + b.T, group_size + ) + elif quant_type == scalar_types.float8_e4m3fn: + if group_size not in [-1, 128] or act_order: + return + marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size) + elif group_size == 16: + return + elif has_zp: + marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize( + b, quant_type, group_size + ) + else: + marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = ( + marlin_quantize(b, quant_type, group_size, act_order) + ) + return ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_s2, + marlin_zp, + marlin_g_idx, + marlin_sort_indices, + ) + + def gen_marlin_24_params(): + marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None + if marlin_24_supported: + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( + marlin_24_quantize(b, quant_type, group_size) + ) + return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) + + def gen_repack_params(): 
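+        # GPTQ-quantize b and pack it row-wise; for act_order runs also build
+        # the sort indices that gptq_marlin_repack needs so that group ids
+        # are increasing.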
+ q_w_gptq = None + repack_sort_indices = None + if repack_supported: + (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( + b, quant_type, group_size, act_order + ) + q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) + + # For act_order, sort the "weights" and "g_idx" + # so that group ids are increasing + repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) + if act_order: + (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) + return q_w_gptq, repack_sort_indices + + def gen_allspark_params(): + qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = ( + CUBLAS_M_THRESHOLD + ) = None + nonlocal allspark_supported + if allspark_supported: + properties = torch.cuda.get_device_properties(b.device.index) + sm_count = properties.multi_processor_count + sm_version = properties.major * 10 + properties.minor + + supported_arch = sm_version >= 80 and sm_version < 90 + allspark_supported = allspark_supported and supported_arch + if supported_arch: + w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) + qw = qw.to(torch.uint8) + + qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( + qw, s, zp, has_zp + ) + CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD + return ( + qw_reorder, + s_reorder, + zp_reorder, + sm_count, + sm_version, + CUBLAS_M_THRESHOLD, + ) + + ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_s2, + marlin_zp, + marlin_g_idx, + marlin_sort_indices, + ) = gen_marlin_params() + marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = ( + gen_marlin_24_params() + ) + q_w_gptq, repack_sort_indices = gen_repack_params() + qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( + gen_allspark_params() + ) + + # Prepare + marlin_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) + marlin_24_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL + ) + + globals = { + # Gen params + "quant_type": quant_type, + "group_size": group_size, + "size_m": size_m, + "size_n": size_n, + "size_k": size_k, + "a": a, + # Marlin params + "marlin_w_ref": marlin_w_ref, + "marlin_q_w": marlin_q_w, + "marlin_s": marlin_s, + "marlin_s2": marlin_s2, + "marlin_zp": marlin_zp, + "marlin_g_idx": marlin_g_idx, + "marlin_sort_indices": marlin_sort_indices, + "marlin_workspace": marlin_workspace, + "is_k_full": is_k_full, + # Marlin_24 params + "marlin_24_w_ref": marlin_24_w_ref, + "marlin_24_q_w_comp": marlin_24_q_w_comp, + "marlin_24_meta": marlin_24_meta, + "marlin_24_s": marlin_24_s, + "marlin_24_workspace": marlin_24_workspace, + # GPTQ params + "q_w_gptq": q_w_gptq, + "repack_sort_indices": repack_sort_indices, + # AllSpark W8A16 params + "qw_reorder": qw_reorder, + "s_reorder": s_reorder, + "zp_reorder": zp_reorder, + "sm_count": sm_count, + "sm_version": sm_version, + "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, + # Kernels + "gptq_marlin_gemm": ops.gptq_marlin_gemm, + "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, + "gptq_marlin_repack": ops.gptq_marlin_repack, + "allspark_w8a16_gemm": ops.allspark_w8a16_gemm, + } + + min_run_time = 1 + + # Warmup pytorch + for _ in range(5): + torch.matmul(a, marlin_w_ref) + + results.append( + benchmark.Timer( + stmt="torch.matmul(a, marlin_w_ref)", + globals=globals, + label=label, + sub_label=sub_label, + description="pytorch_gemm", + ).blocked_autorange(min_run_time=min_run_time) + ) + + results.append( + benchmark.Timer( + stmt="output = gptq_marlin_gemm(a, 
None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm", + ).blocked_autorange(min_run_time=min_run_time) + ) + + results.append( + benchmark.Timer( + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp32", + ).blocked_autorange(min_run_time=min_run_time) + ) + + if marlin_24_supported: + results.append( + benchmark.Timer( + stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_24_gemm", + ).blocked_autorange(min_run_time=min_run_time) + ) + + if repack_supported: + results.append( + benchmark.Timer( + stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_repack", + ).blocked_autorange(min_run_time=min_run_time) + ) + + if allspark_supported: + results.append( + benchmark.Timer( + stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="allspark_w8a16_gemm_fp32", + ).blocked_autorange(min_run_time=min_run_time) + ) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + results: list[benchmark.Measurement] = [] + + for model in args.models: + for layer in WEIGHT_SHAPES[model]: + size_k = layer[0] + size_n = layer[1] + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for act_order in ACT_ORDER_OPTS: + if ( + len(args.limit_act_order) > 0 + and act_order not in args.limit_act_order + ): + continue + + for is_k_full in K_FULL_OPTS: + if ( + len(args.limit_k_full) > 0 + and is_k_full not in args.limit_k_full + ): + continue + + for quant_type in query_marlin_supported_quant_types(): + if ( + len(args.limit_num_bits) > 0 + and quant_type.size_bits not in args.limit_num_bits + ): + continue + + for group_size in ( + MARLIN_SUPPORTED_GROUP_SIZES + + FP4_MARLIN_SUPPORTED_GROUP_SIZES + ): + if ( + len(args.limit_group_size) > 0 + and group_size not in args.limit_group_size + ): + continue + + # For act_order, the group_size must be less than + # size_k + if act_order and (group_size == size_k or group_size == -1): + continue + + for size_m in args.batch_sizes: + bench_run( + results, + model, + act_order, + is_k_full, + quant_type, + group_size, + size_m, + size_k, + size_n, + ) + + compare = benchmark.Compare(results) + compare.print() + + +# For quick benchmarking use: +# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 +# +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin 
across specified models/shapes/batches" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[]) + parser.add_argument("--limit-act-order", nargs="+", type=int, default=[]) + parser.add_argument("--limit-k-full", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_moe.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_moe.py new file mode 100644 index 0000000..c350aaf --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_moe.py @@ -0,0 +1,749 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import time +from contextlib import nullcontext +from datetime import datetime +from itertools import product +from typing import Any, TypedDict + +import ray +import torch +from ray.experimental.tqdm_ray import tqdm + +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config +from vllm.triton_utils import triton +from vllm.utils import FlexibleArgumentParser + +FP8_DTYPE = current_platform.fp8_dtype() + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_config( + config: BenchmarkConfig, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + block_quant_shape: list[int] = None, + use_deep_gemm: bool = False, +) -> float: + init_dtype = torch.float16 if use_fp8_w8a8 else dtype + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + if use_int8_w8a16: + w1 = torch.randint( + -127, + 127, + ( + num_experts, + shard_intermediate_size, + hidden_size, + ), + dtype=torch.int8, + ) + w2 = torch.randint( + -127, + 127, + ( + num_experts, + hidden_size, + shard_intermediate_size // 2, + ), + dtype=torch.int8, + ) + else: + w1 = torch.randn( + num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype + ) + w2 = torch.randn( + num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype + ) + gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) + + w1_scale = None + w2_scale = None + a1_scale = None + a2_scale = None + if use_int8_w8a16: + w1_scale = torch.randn( + (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 + ) + w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if use_deep_gemm: + # we use the default block shape for deepgemm + block_quant_shape = [128, 128] + if use_fp8_w8a8: + if block_quant_shape: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + E = num_experts + N = shard_intermediate_size // 2 + K = hidden_size + factor_for_scale = 1e-2 + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // 
block_k + w1_scale = ( + torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) + * factor_for_scale + ) + w2_scale = ( + torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) + * factor_for_scale + ) + else: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + + a1_scale = torch.randn(1, dtype=torch.float32) + a2_scale = torch.randn(1, dtype=torch.float32) + + w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) + + input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + from vllm.model_executor.layers.fused_moe import override_config + + with override_config(config): + if use_deep_gemm: + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, False + ) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + allow_deep_gemm=True, + ) + else: + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +def get_rocm_tuning_space(use_fp16): + block_mn_range = [16, 32, 64, 128, 256] + block_k_range = [16, 32, 64, 128, 256] + if not use_fp16: + block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 + num_warps_range = [1, 2, 4, 8] + group_m_range = [1, 4, 8, 16, 32] + num_stage_range = [2] + waves_per_eu_range = [0] + matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] + kpack_range = [1, 2] if use_fp16 else [] + + param_ranges = { + "BLOCK_SIZE_M": block_mn_range, + "BLOCK_SIZE_N": block_mn_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + "waves_per_eu": waves_per_eu_range, + } + if use_fp16: + param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range + param_ranges["kpack"] = kpack_range + + return param_ranges + + +def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]: + configs: list[BenchmarkConfig] = [] + + if current_platform.is_rocm(): + param_ranges = get_rocm_tuning_space(use_fp16) + else: + # Reduced search space for faster tuning. + # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. 
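+        # Even reduced, the Cartesian product below already yields
+        # 5 * 4 * 3 * 2 * 4 * 4 = 1920 candidate configs, before the fp8
+        # block-quant compatibility filter at the end of this function
+        # (when applicable) prunes it further.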
+ block_m_range = [16, 32, 64, 128, 256] + block_n_range = [32, 64, 128, 256] + block_k_range = [64, 128, 256] + num_warps_range = [4, 8] + group_m_range = [1, 16, 32, 64] + num_stage_range = [2, 3, 4, 5] + + param_ranges = { + "BLOCK_SIZE_M": block_m_range, + "BLOCK_SIZE_N": block_n_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + } + + keys, values = zip(*param_ranges.items()) + for config_values in product(*values): + config = dict(zip(keys, config_values)) + configs.append(config) + + # Remove configs that are not compatible with fp8 block quantization + # BLOCK_SIZE_K must be a multiple of block_k + # BLOCK_SIZE_N must be a multiple of block_n + if block_quant_shape is not None and not use_fp16: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + for config in configs[:]: + if ( + config["BLOCK_SIZE_K"] % block_k != 0 + or config["BLOCK_SIZE_N"] % block_n != 0 + ): + configs.remove(config) + return configs + + +def prune_rocm_search_space( + num_tokens, shard_intermediate_size, hidden_size, search_space, is_fp16, topk +): + N1, K1 = shard_intermediate_size, hidden_size + N2, K2 = hidden_size, shard_intermediate_size // 2 + pruned_space_1 = prune_rocm_configs( + num_tokens * topk, N1, K1, search_space, is_fp16 + ) + pruned_space_2 = prune_rocm_configs( + num_tokens * topk, N2, K2, search_space, is_fp16 + ) + search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) + return search_space + + +# The following code is inspired by ROCm/Triton GEMM tuning script: +# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89 +def prune_rocm_configs(M, N, K, configs, is_fp16=True): + pruned_configs = [] + elemBytes_a = 2 if is_fp16 else 1 + elemBytes_b = 2 if is_fp16 else 1 + + mfma = 16 if M < 32 or N < 32 else 32 + + # TODO (zhanglx): figure out the boundary between large and small gemms + large_gemm = False + if M >= 2048 and N >= 2048: + large_gemm = True + + for config in configs: + BLOCK_SIZE_M = config.get("BLOCK_SIZE_M") + BLOCK_SIZE_N = config.get("BLOCK_SIZE_N") + BLOCK_SIZE_K = config.get("BLOCK_SIZE_K") + num_warps = config.get("num_warps") + + if is_fp16: + matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") + if matrix_instr_nonkdim > mfma: + continue + if mfma == 4 and BLOCK_SIZE_K < 64: + continue + # some layouts could not work properly in case + # number elements per thread is less 1 + if BLOCK_SIZE_M * BLOCK_SIZE_N < 64: + continue + SPLIT_K = config.get("SPLIT_K", 1) + GROUP_M = config.get("GROUP_SIZE_M") + if is_fp16: + if ( + matrix_instr_nonkdim > BLOCK_SIZE_M + or matrix_instr_nonkdim > BLOCK_SIZE_N + ): + continue + if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M: + continue + if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N: + continue + # Skip BLOCK_SIZE that is too large compare to M/N + # unless BLOCK_SIZE is already small enough + if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16: + continue + if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16: + continue + # skip large split_k when not necessary + if SPLIT_K != 1 and not need_split_k(M, N, K): + continue + # skip split_k that leads to EVEN_K = false + leap = SPLIT_K * BLOCK_SIZE_K + modv = K % leap + if modv != 0: + continue + # skip large GROUP_M + if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1: + continue + # out of shared memory resource + # TODO (zhanglx): This does not consider the LDS usage in the epilogue + LDS = ( + BLOCK_SIZE_K 
* BLOCK_SIZE_M * elemBytes_a + + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b + ) + if LDS > 65536: + continue + # Skip small block sizes and num_warps for large gemm + # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 + if large_gemm: + if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: + continue + if BLOCK_SIZE_K < 64: + continue + if num_warps < 4: + continue + + pruned_configs.append(config) + + return pruned_configs + + +def need_split_k(SIZE_M, SIZE_N, SIZE_K): + return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 + + +def merge_unique_dicts(list1, list2): + result = [] + combined_list = list1.copy() + combined_list.extend(list2) + for dictionary in combined_list: + if dictionary not in result: + result.append(dictionary) + return result + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) + + def benchmark( + self, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + block_quant_shape: list[int] = None, + use_deep_gemm: bool = False, + ) -> tuple[dict[str, int], float]: + current_platform.seed_everything(self.seed) + dtype_str = get_config_dtype_str( + dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + ) + # NOTE(woosuk): The current naming convention uses w2.shape[2], which + # is the intermediate size after silu_and_mul. + op_config = get_moe_configs( + num_experts, shard_intermediate_size // 2, dtype_str + ) + if op_config is None: + config = get_default_config( + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype_str, + is_marlin=False, + ) + else: + config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm, + ) + return config, kernel_time + + def tune( + self, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + search_space: list[dict[str, int]], + block_quant_shape: list[int], + use_deep_gemm: bool, + ) -> dict[str, int]: + best_config = None + best_time = float("inf") + if current_platform.is_rocm(): + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = prune_rocm_search_space( + num_tokens, + shard_intermediate_size, + hidden_size, + search_space, + is_fp16, + topk, + ) + + need_device_guard = False + if current_platform.is_rocm(): + visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None) + if visible_device != f"{self.device_id}": + need_device_guard = True + + with torch.cuda.device(self.device_id) if need_device_guard else nullcontext(): + for config in tqdm(search_space): + try: + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20, + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm, + ) + except 
triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. + continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config + now = datetime.now() + print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None + return best_config + + +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: + return { + "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": config["GROUP_SIZE_M"], + "num_warps": config["num_warps"], + "num_stages": config["num_stages"], + **( + {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {} + ), + **( + {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]} + if "matrix_instr_nonkdim" in config + else {} + ), + **({"kpack": config["kpack"]} if "kpack" in config else {}), + } + + +def save_configs( + configs: dict[int, BenchmarkConfig], + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + block_quant_shape: list[int], +) -> None: + dtype_str = get_config_dtype_str( + dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + ) + + # NOTE(woosuk): The current naming convention uses w2.shape[2], which + # is the intermediate size after silu_and_mul. + filename = get_config_file_name( + num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape + ) + + print(f"Writing best config to {filename}...") + with open(filename, "w") as f: + json.dump(configs, f, indent=4) + f.write("\n") + + +def get_weight_block_size_safety(config, default_value=None): + quantization_config = getattr(config, "quantization_config", {}) + if isinstance(quantization_config, dict): + return quantization_config.get("weight_block_size", default_value) + return default_value + + +def main(args: argparse.Namespace): + print(args) + + config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) + if args.model_prefix: + config = getattr(config, args.model_prefix) + + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + intermediate_size = config.ffn_config.ffn_hidden_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ( + "DeepseekV3ForCausalLM", + "DeepseekV2ForCausalLM", + "Glm4MoeForCausalLM", + ): + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): + E = config.num_experts + topk = config.moe_topk[0] + intermediate_size = config.moe_intermediate_size[0] + shard_intermediate_size = 2 * intermediate_size // args.tp_size + else: + # Support for llama4 + config = config.get_text_config() + # Default: Mixtral. 
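Editor's note: the prune_rocm_configs heuristic above rejects any tiling whose A tile (BLOCK_SIZE_K x BLOCK_SIZE_M) and B tile (BLOCK_SIZE_K x BLOCK_SIZE_N) would not both fit in a compute unit's 64 KiB of LDS; as the TODO in that function notes, epilogue usage is ignored. A small stand-alone illustration of that budget check (the function name is illustrative):

def fits_in_lds(block_m: int, block_n: int, block_k: int, elem_bytes: int = 2) -> bool:
    # LDS footprint of the A and B tiles, mirroring the check in prune_rocm_configs
    lds_bytes = block_k * block_m * elem_bytes + block_k * block_n * elem_bytes
    return lds_bytes <= 65536

# fp16 tiles (2 bytes per element):
assert fits_in_lds(256, 256, 64)        # 32 KiB + 32 KiB = 64 KiB, kept
assert not fits_in_lds(256, 256, 128)   # 64 KiB + 64 KiB = 128 KiB, pruned
# fp8 tiles (1 byte per element) allow a larger BLOCK_SIZE_K at the same footprint:
assert fits_in_lds(256, 256, 128, elem_bytes=1)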
+ E = config.num_local_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + + hidden_size = config.hidden_size + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + block_quant_shape = get_weight_block_size_safety(config) + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] + else: + batch_sizes = args.batch_size + + use_deep_gemm = bool(args.use_deep_gemm) + + if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ: + # Ray will set ROCR_VISIBLE_DEVICES for device visibility + logger.warning( + "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility." + "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES." + ) + val = os.environ["HIP_VISIBLE_DEVICES"] + os.environ["ROCR_VISIBLE_DEVICES"] = val + del os.environ["HIP_VISIBLE_DEVICES"] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: list[Any]) -> list[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + if args.tune: + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = get_configs_compute_bound(is_fp16, block_quant_shape) + print(f"Start tuning over {len(search_space)} configurations...") + + start = time.time() + configs = _distribute( + "tune", + [ + ( + batch_size, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + search_space, + block_quant_shape, + use_deep_gemm, + ) + for batch_size in batch_sizes + ], + ) + best_configs = { + M: sort_config(config) for M, config in zip(batch_sizes, configs) + } + save_configs( + best_configs, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + block_quant_shape, + ) + end = time.time() + print(f"Tuning took {end - start:.2f} seconds") + else: + outputs = _distribute( + "benchmark", + [ + ( + batch_size, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + block_quant_shape, + use_deep_gemm, + ) + for batch_size in batch_sizes + ], + ) + + for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}, config: {config}") + print(f"Kernel time: {kernel_time:.2f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument( + "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + parser.add_argument( + "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 + ) + parser.add_argument( + "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + ) + parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, nargs="+", required=False) + parser.add_argument("--tune", action="store_true") + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--model-prefix", type=str, required=False) + 
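Editor's note: given the arguments defined above, typical invocations (run from benchmarks/kernels/, shown only as an illustration) either tune Triton configs for every batch size and write the best ones to the JSON file produced by save_configs, or simply benchmark the configs that already ship with vLLM:

# Tune fused-MoE configs for Mixtral at TP=2 with fp8 weights/activations:
python benchmark_moe.py --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
    --tp-size 2 --dtype fp8_w8a8 --tune
# Benchmark the existing configs at a few batch sizes instead of tuning:
python benchmark_moe.py --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
    --tp-size 2 --dtype fp8_w8a8 --batch-size 1 16 256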
args = parser.parse_args() + + main(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_moe_align_block_size.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_moe_align_block_size.py new file mode 100644 index 0000000..1af5a21 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import itertools + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size_triton, +) +from vllm.triton_utils import triton + + +def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: + return torch.stack( + [ + torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] + for _ in range(num_tokens) + ] + ) + + +def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): + """ + Verifies vllm vs. Triton + """ + topk_ids = get_topk_ids(num_tokens, num_experts, topk) + + # 1. malloc space for triton and vllm + # malloc enough space (max_num_tokens_padded) for the sorted ids + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + sorted_ids_triton = torch.empty( + (max_num_tokens_padded,), dtype=torch.int32, device="cuda" + ) + expert_ids_triton = torch.empty( + (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda" + ) + num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda") + + sorted_ids_vllm = torch.empty_like(sorted_ids_triton) + expert_ids_vllm = torch.empty_like(expert_ids_triton) + num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton) + + # 2. run implementations + moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids_triton, + expert_ids_triton, + num_tokens_post_pad_triton, + ) + + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids_vllm, + expert_ids_vllm, + num_tokens_post_pad_vllm, + ) + print(f"✅ VLLM implementation works with {num_experts} experts!") + + # 3. 
compare results + if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose( + num_tokens_post_pad_triton, num_tokens_post_pad_vllm + ): + print("✅ Triton and VLLM implementations match.") + else: + print("❌ Triton and VLLM implementations DO NOT match.") + print("Triton expert_ids:", expert_ids_triton) + print("VLLM expert_ids:", expert_ids_vllm) + print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton) + print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm) + + +# test configurations +num_tokens_range = [1, 16, 256, 4096] +num_experts_range = [16, 64, 224, 256, 280, 512] +topk_range = [1, 2, 8] +configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["num_tokens", "num_experts", "topk"], + x_vals=configs, + line_arg="provider", + line_vals=["vllm", "triton"], # "triton" + line_names=["VLLM", "Triton"], # "Triton" + plot_name="moe-align-block-size-performance", + args={}, + ) +) +def benchmark(num_tokens, num_experts, topk, provider): + """Benchmark function for Triton.""" + block_size = 256 + topk_ids = get_topk_ids(num_tokens, num_experts, topk) + + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") + max_num_m_blocks = max_num_tokens_padded // block_size + expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda") + num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda") + + quantiles = [0.5, 0.2, 0.8] + + if provider == "vllm": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids.clone(), + expert_ids.clone(), + num_tokens_post_pad.clone(), + ), + quantiles=quantiles, + ) + elif provider == "triton": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids.clone(), + expert_ids.clone(), + num_tokens_post_pad.clone(), + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--num_experts", + type=int, + default=64, + choices=[8, 16, 32, 64, 128, 256], + ) + parser.add_argument( + "--topk", + type=int, + default=8, + choices=[2, 4, 8], + help="Top-k value for correctness check.", + ) + args = parser.parse_args() + + print("Running correctness check...") + check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk) + benchmark.run(print_data=True, show_plots=True) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_moe_permute_unpermute.py new file mode 100644 index 0000000..4ed6900 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -0,0 +1,418 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +from typing import Any, TypedDict + +import ray +import torch +from transformers import AutoConfig + +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + _moe_permute, + _moe_unpermute_and_reduce, +) +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * +from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize +from 
vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +FP8_DTYPE = current_platform.fp8_dtype() + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_permute( + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False, +) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + # output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False + ) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( + moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + ) + else: + ( + permuted_hidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = _moe_permute( + qhidden_states, None, topk_ids, num_experts, None, align_block_size + ) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +def benchmark_unpermute( + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False, +) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False + ) + + def prepare(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( + moe_permute( + 
qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + ) + # convert to fp16/bf16 as gemm output + return ( + permuted_hidden_states.to(dtype), + first_token_off, + inv_perm_idx, + m_indices, + ) + else: + ( + permuted_qhidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = _moe_permute( + qhidden_states, None, topk_ids, num_experts, None, align_block_size + ) + # convert to fp16/bf16 as gemm output + return ( + permuted_qhidden_states.to(dtype), + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) + + def run(input: tuple): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + moe_unpermute( + permuted_hidden_states, + topk_weights, + topk_ids, + inv_perm_idx, + first_token_off, + topk, + num_experts, + num_experts, + ) + else: + ( + permuted_hidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = input + _moe_unpermute_and_reduce( + output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + ) + + # JIT compilation & warmup + input = prepare() + run(input) + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run(input) + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. 
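Editor's note: both MoE benchmark scripts share the same Ray layout: one actor per GPU via @ray.remote(num_gpus=1), work items handed out round-robin, and results gathered with ray.get(). A minimal sketch of that pattern, assuming at least one visible GPU (the actor and method names are placeholders, not vLLM code):

import ray
import torch

@ray.remote(num_gpus=1)
class GpuWorker:
    def __init__(self) -> None:
        torch.set_default_device("cuda")
        # Ray exposes the GPU assigned to this actor via ray.get_gpu_ids().
        self.device_id = int(ray.get_gpu_ids()[0])

    def run(self, size: int) -> float:
        # stand-in for BenchmarkWorker.benchmark / BenchmarkWorker.tune
        x = torch.randn(size, size)
        return float((x @ x).sum())

ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [GpuWorker.remote() for _ in range(num_gpus)]
# Round-robin the inputs across workers, then block on all results at once.
futures = [workers[i % num_gpus].run.remote(1024 * (i + 1)) for i in range(8)]
print(ray.get(futures))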
+ self.device_id = int(ray.get_gpu_ids()[0]) + + def benchmark( + self, + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_customized_permute: bool = False, + ) -> tuple[dict[str, int], float]: + current_platform.seed_everything(self.seed) + + permute_time = benchmark_permute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute, + ) + unpermute_time = benchmark_unpermute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute, + ) + return permute_time, unpermute_time + + +def get_weight_block_size_safety(config, default_value=None): + quantization_config = getattr(config, "quantization_config", {}) + if isinstance(quantization_config, dict): + return quantization_config.get("weight_block_size", default_value) + return default_value + + +def main(args: argparse.Namespace): + print(args) + + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code + ) + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + elif ( + config.architectures[0] == "DeepseekV3ForCausalLM" + or config.architectures[0] == "DeepseekV2ForCausalLM" + or config.architectures[0] == "Glm4MoeForCausalLM" + ): + E = config.n_routed_experts + topk = config.num_experts_per_tok + elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]: + E = config.num_experts + topk = config.num_experts_per_tok + + else: + # Support for llama4 + config = config.get_text_config() + # Default: Mixtral. 
+ E = config.num_local_experts + topk = config.num_experts_per_tok + + hidden_size = config.hidden_size + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + use_customized_permute = args.use_customized_permute + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] + else: + batch_sizes = [args.batch_size] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: list[Any]) -> list[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + outputs = _distribute( + "benchmark", + [ + ( + batch_size, + E, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + use_customized_permute, + ) + for batch_size in batch_sizes + ], + ) + + for batch_size, (permute, unpermute) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}") + print(f"Permute time: {permute:.2f} us") + print(f"Unpermute time: {unpermute:.2f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument( + "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + parser.add_argument( + "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + ) + parser.add_argument("--use-customized-permute", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--trust-remote-code", action="store_true") + args = parser.parse_args() + + main(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_paged_attention.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_paged_attention.py new file mode 100644 index 0000000..7e0376c --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_paged_attention.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import random +import time +from typing import Optional + +import torch + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random, +) + +logger = init_logger(__name__) + +NUM_BLOCKS = 128 * 1024 +PARTITION_SIZE = 512 +PARTITION_SIZE_ROCM = 256 + + +@torch.inference_mode() +def main( + version: str, + num_seqs: int, + seq_len: int, + num_query_heads: int, + num_kv_heads: int, + head_size: int, + use_alibi: bool, + block_size: int, + dtype: torch.dtype, + seed: int, + do_profile: bool, + device: str = "cuda", + kv_cache_dtype: Optional[str] = None, +) -> None: + current_platform.seed_everything(seed) + + scale = float(1.0 / (head_size**0.5)) + query = torch.empty( + num_seqs, num_query_heads, head_size, dtype=dtype, device=device + ) + query.uniform_(-scale, scale) + + assert num_query_heads % num_kv_heads == 0 + alibi_slopes = None + if use_alibi: + alibi_slopes = torch.randn(num_query_heads, dtype=torch.float, device=device) + + seq_lens = [seq_len for _ in range(num_seqs)] + 
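Editor's note: the paged-attention benchmark set up just below gives every sequence ceil(seq_len / block_size) KV-cache blocks and fills its block table with random block indices drawn from a global pool of NUM_BLOCKS. A tiny illustration of that bookkeeping (the helper name is illustrative):

import random

def make_block_table(seq_len: int, block_size: int, num_blocks: int) -> list[int]:
    blocks_needed = (seq_len + block_size - 1) // block_size   # ceiling division
    return [random.randint(0, num_blocks - 1) for _ in range(blocks_needed)]

# e.g. a 4096-token sequence with 16-token blocks needs 256 block-table entries
assert len(make_block_table(4096, 16, 128 * 1024)) == 256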
max_seq_len = max(seq_lens) + seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device) + + # Create the block tables. + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables_lst: list[list[int]] = [] + for _ in range(num_seqs): + block_table = [ + random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) + ] + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device) + + # Create the KV cache. + key_caches, value_caches = create_kv_caches_with_random( + NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + + # Prepare for the paged attention kernel. + output = torch.empty_like(query) + if version == "v2": + if current_platform.is_rocm(): + global PARTITION_SIZE + if not args.custom_paged_attn and not current_platform.is_navi(): + PARTITION_SIZE = 1024 + else: + PARTITION_SIZE = PARTITION_SIZE_ROCM + num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE + tmp_output = torch.empty( + size=(num_seqs, num_query_heads, num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_query_heads, num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + # Using default kv_scale + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) + + for _ in range(num_iters): + if version == "v1": + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + elif version == "v2": + if not args.custom_paged_attn: + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + ops.paged_attention_rocm( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + None, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + raise ValueError(f"Invalid version: {version}") + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStop() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=3, profile=False) + + # Benchmark. + if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=100, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == "__main__": + logger.warning( + "This script benchmarks the paged attention kernel. " + "By default this is no longer used in vLLM inference." 
+ ) + + parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.") + parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2") + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument("--seq-len", type=int, default=4096) + parser.add_argument("--num-query-heads", type=int, default=64) + parser.add_argument("--num-kv-heads", type=int, default=8) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--use-alibi", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"], + default="auto", + help="Data type for kv cache storage. If 'auto', will use model " + "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. " + "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)", + ) + parser.add_argument( + "--custom-paged-attn", action="store_true", help="Use custom paged attention" + ) + args = parser.parse_args() + print(args) + + if args.num_query_heads % args.num_kv_heads != 0: + raise ValueError("num_query_heads must be divisible by num_kv_heads") + main( + version=args.version, + num_seqs=args.batch_size, + seq_len=args.seq_len, + num_query_heads=args.num_query_heads, + num_kv_heads=args.num_kv_heads, + head_size=args.head_size, + block_size=args.block_size, + use_alibi=args.use_alibi, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + kv_cache_dtype=args.kv_cache_dtype, + ) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_quant.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_quant.py new file mode 100644 index 0000000..6ab26f5 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_quant.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import time + +import torch + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + + +@torch.inference_mode() +def main( + num_tokens: int, + hidden_size: int, + static_scale: bool, + quant_dtype: torch.dtype, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device("cuda") + + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + for _ in range(num_iters): + if quant_dtype == torch.int8: + ops.scaled_int8_quant(x, scale) + else: + ops.scaled_fp8_quant(x, scale) + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStop() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=num_warmup_iters, profile=False) + + # Benchmark. 
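Editor's note: the quantization being timed here is per-tensor symmetric scaling: with --static-scale a precomputed scale tensor is passed to the kernel, otherwise a scale is computed on the fly (dynamic quantization). The sketch below is a pure-PyTorch reference of that idea only; it is not vLLM's ops.scaled_int8_quant / ops.scaled_fp8_quant, whose exact signatures and return values are not reproduced here:

from typing import Optional

import torch

def int8_quant_reference(
    x: torch.Tensor, static_scale: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, torch.Tensor]:
    # dynamic mode derives a per-tensor scale from the input itself
    scale = static_scale if static_scale is not None else x.abs().amax().float() / 127.0
    q = torch.clamp(torch.round(x.float() / scale), -128, 127).to(torch.int8)
    return q, scale

x = torch.randn(4096, 8192, dtype=torch.float16)
q_dyn, s_dyn = int8_quant_reference(x)                       # dynamic: scale computed
q_sta, s_sta = int8_quant_reference(x, torch.tensor(0.05))   # static: scale supplied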
+ if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=num_iters, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError(f"Unsupported dtype: {dt}") + + parser = FlexibleArgumentParser( + description="Benchmark the quantization (fp8 or int8) kernel." + ) + parser.add_argument("--num-tokens", type=int, default=4096) + parser.add_argument("--hidden-size", type=int, default=8192) + parser.add_argument("--static-scale", action="store_true") + parser.add_argument( + "--quant-dtype", type=str, choices=["fp8", "int8"], default="int8" + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--num-warmup-iters", type=int, default=5) + parser.add_argument( + "--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. " + "If --profile is set, this number is ignored", + ) + + args = parser.parse_args() + print(args) + + main( + num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + static_scale=args.static_scale, + quant_dtype=to_torch_dtype(args.quant_dtype), + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters, + ) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_rmsnorm.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_rmsnorm.py new file mode 100644 index 0000000..4cf633a --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_rmsnorm.py @@ -0,0 +1,256 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools +from typing import Optional, Union + +import torch +from flashinfer.norm import fused_add_rmsnorm, rmsnorm +from torch import nn + +from vllm import _custom_ops as vllm_ops +from vllm.triton_utils import triton + + +class HuggingFaceRMSNorm(nn.Module): + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + orig_dtype = x.dtype + x = x.to(torch.float32) + if residual is not None: + x = x + residual.to(torch.float32) + residual = x.to(orig_dtype) + + variance = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + x = x.to(orig_dtype) * self.weight + if residual is None: + return x + else: + return x, residual + + +def rmsnorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) + naive_norm.weight = nn.Parameter(weight) + naive_norm = naive_norm.to(x.device) + + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + output = naive_norm(x, residual) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_flashinfer( + x: torch.Tensor, + 
weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + fused_add_rmsnorm(x, residual, weight, eps) + output = (x, residual) + else: + output = rmsnorm(x, weight, eps) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + vllm_ops.fused_add_rms_norm(x, residual, weight, eps) + output = (x, residual) + else: + out = torch.empty_like(x) + vllm_ops.rms_norm(out, x, weight, eps) + output = out + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): + dtype = torch.bfloat16 + x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + output_naive = rmsnorm_naive( + x.clone(), weight, residual.clone() if residual is not None else None + ) + output_flashinfer = rmsnorm_flashinfer( + x.clone(), weight, residual.clone() if residual is not None else None + ) + output_vllm = rmsnorm_vllm( + x.clone(), weight, residual.clone() if residual is not None else None + ) + + if use_residual: + output_naive = output_naive[0] + output_flashinfer = output_flashinfer[0] + output_vllm = output_vllm[0] + + print(f"Naive output={output_naive}") + print(f"FlashInfer output={output_flashinfer}") + print(f"vLLM output={output_vllm}") + + if torch.allclose( + output_naive, output_flashinfer, atol=1e-2, rtol=1e-2 + ) and torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +head_num_range = [32, 48] +configs = list(itertools.product(head_num_range, batch_size_range, seq_length_range)) + + +def get_benchmark(use_residual): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["head_num", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["huggingface", "flashinfer", "vllm"], + line_names=["HuggingFace", "FlashInfer", "vLLM"], + styles=[("blue", "-"), ("green", "-"), ("red", "-")], + ylabel="us", + plot_name=f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", + args={}, + ) + ) + def benchmark(head_num, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_size = head_num * 128 # assuming head_dim = 128 + + x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + quantiles = [0.5, 0.2, 0.8] + + if provider == "huggingface": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_naive( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + 
quantiles=quantiles, + ) + elif provider == "flashinfer": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_flashinfer( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_vllm( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-size", + type=int, + default=4096, + help="Hidden size (2nd dimension) of the sequence", + ) + parser.add_argument( + "--use-residual", action="store_true", help="Whether to use residual connection" + ) + parser.add_argument( + "--save-path", + type=str, + default="./configs/rmsnorm/", + help="Path to save rmsnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff( + batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_size=args.hidden_size, + use_residual=args.use_residual, + ) + + # Get the benchmark function with proper use_residual setting + benchmark = get_benchmark(args.use_residual) + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_rope.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_rope.py new file mode 100644 index 0000000..b81baf1 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_rope.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from itertools import accumulate +from typing import Optional + +import nvtx +import torch + +from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + + +def benchmark_rope_kernels_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: float = 10000, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + # silulating serving 4 LoRAs + scaling_factors = [1, 2, 4, 8] + # batched RoPE can take multiple scaling factors + batched_rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": tuple(scaling_factors)}, + ) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior + non_batched_ropes: list[RotaryEmbedding] = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": (scaling_factor,)}, + ) + ) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype) + key = torch.randn_like(query) + + # create query offsets for batched RoPE, we concat multiple kv cache + # together and each query needs to find the right kv 
cache of its type + offset_map = torch.tensor( + list( + accumulate( + [0] + + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ] + ) + ) + ) + query_types = torch.randint( + 0, len(scaling_factors), (batch_size, seq_len), device=device + ) + # map query types to offsets + query_offsets = offset_map[query_types] + # the kernel takes flattened offsets + flatten_offsets = query_offsets.flatten() + + # batched queries of the same type together for non-batched RoPE + queries = [query[query_types == i] for i in range(len(scaling_factors))] + keys = [key[query_types == i] for i in range(len(scaling_factors))] + packed_qkr = zip(queries, keys, non_batched_ropes) + # synchronize before start timing + torch.cuda.synchronize() + with nvtx.annotate("non-batched", color="yellow"): + for q, k, r in packed_qkr: + r.forward(positions, q, k) + torch.cuda.synchronize() + with nvtx.annotate("batched", color="green"): + batched_rope.forward(positions, query, key, flatten_offsets) + torch.cuda.synchronize() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels." + ) + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) + parser.add_argument("--seq-len", type=int, default=512) + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument( + "--dtype", type=str, choices=["bfloat16", "float"], default="float" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0" + ) + args = parser.parse_args() + print(args) + + benchmark_rope_kernels_multi_lora( + is_neox_style=args.is_neox_style, + batch_size=args.batch_size, + seq_len=args.seq_len, + num_heads=args.num_heads, + head_size=args.head_size, + rotary_dim=args.rotary_dim, + dtype=getattr(torch, args.dtype), + seed=args.seed, + device=args.device, + ) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_shapes.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_shapes.py new file mode 100644 index 0000000..18c459c --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_shapes.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +WEIGHT_SHAPES = { + "ideal": [[4 * 256 * 32, 256 * 32]], + "mistralai/Mistral-7B-v0.1/TP1": [ + [4096, 6144], + [4096, 4096], + [4096, 28672], + [14336, 4096], + ], + "mistralai/Mistral-7B-v0.1/TP2": [ + [4096, 3072], + [2048, 4096], + [4096, 14336], + [7168, 4096], + ], + "mistralai/Mistral-7B-v0.1/TP4": [ + [4096, 1536], + [1024, 4096], + [4096, 7168], + [3584, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP1": [ + [4096, 12288], + [4096, 4096], + [4096, 22016], + [11008, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP2": [ + [4096, 6144], + [2048, 4096], + [4096, 11008], + [5504, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP4": [ + [4096, 3072], + [1024, 4096], + [4096, 5504], + [2752, 4096], + ], + "meta-llama/Llama-2-13b-hf/TP1": [ + [5120, 15360], + [5120, 5120], + [5120, 27648], + [13824, 5120], + ], + "meta-llama/Llama-2-13b-hf/TP2": [ + [5120, 7680], + [2560, 5120], + [5120, 13824], + [6912, 5120], + ], + "meta-llama/Llama-2-13b-hf/TP4": [ + [5120, 3840], + [1280, 5120], + [5120, 6912], 
+ [3456, 5120], + ], + "meta-llama/Llama-2-70b-hf/TP1": [ + [8192, 10240], + [8192, 8192], + [8192, 57344], + [28672, 8192], + ], + "meta-llama/Llama-2-70b-hf/TP2": [ + [8192, 5120], + [4096, 8192], + [8192, 28672], + [14336, 8192], + ], + "meta-llama/Llama-2-70b-hf/TP4": [ + [8192, 2560], + [2048, 8192], + [8192, 14336], + [7168, 8192], + ], +} + +WEIGHT_SHAPES_MOE = { + "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ + [8, 2, 4096, 28672], + [8, 2, 14336, 4096], + ], + "nm-testing/deepseekv2-lite": [ + [64, 6, 2048, 1408], + ], + "ibm-granite/granite-3.0-1b-a400m": [ + [32, 8, 1024, 1024], + ], + "ibm-granite/granite-3.0-3b-a800m": [ + [40, 8, 1024, 1536], + ], +} diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_trtllm_attention.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_trtllm_attention.py new file mode 100644 index 0000000..8c980f9 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_trtllm_attention.py @@ -0,0 +1,240 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +import random +from datetime import datetime + +import flashinfer +import torch + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 + +# KV Cache Layout for TRT-LLM +# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim) + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_decode( + num_seqs, + max_seq_len, + page_size=16, + dtype=torch.bfloat16, + kv_layout="HND", + num_kv_heads=8, + kv_cache_dtype="auto", + head_dim=128, + warmup=10, + trials=20, +): + torch.set_default_device("cuda") + device = "cuda" + torch.manual_seed(0) + + # Currently only HEAD_GRP_SIZE == 8 is supported + HEAD_GRP_SIZE = 8 + MAX_SEQ_LEN = max_seq_len + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / page_size) + + workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8, device=device) + + # For decode, batch_size is num_decode_token + num_qo_heads = num_kv_heads * HEAD_GRP_SIZE + sm_scale = float(1.0 / (head_dim**0.5)) + q = torch.randn(num_seqs, num_qo_heads, head_dim, device=device, dtype=dtype) + kv_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] + + max_kv_len = max(kv_lens) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=device) + max_num_blocks_per_seq = (max_kv_len + page_size - 1) // page_size + + block_tables = torch.randint( + 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) + + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim) + kv_cache = torch.randn(size=kv_cache_shape, device=device, dtype=dtype) + k_scale = v_scale = 1.0 + + if kv_cache_dtype.startswith("fp8"): + kv_cache, _ = to_float8(kv_cache) + + # Benchmark TRT decode + def trt_decode(): + return flashinfer.decode.trtllm_batch_decode_with_kv_cache( + q, + kv_cache, + workspace_buffer, + num_qo_heads, + num_kv_heads, + sm_scale, + block_tables, + kv_lens_tensor, + page_size, + max_kv_len, + kv_cache_dtype, + k_scale, + v_scale, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for 
i in range(trials): + start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + # TRT Decode + trt_mean, trt_std = time_fn(trt_decode) + + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = kv_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + page_size - 1) // page_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % page_size + if kv_last_page_len == 0: + kv_last_page_len = page_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout, + use_tensor_cores=((num_qo_heads // num_kv_heads) > 4), + ) + + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + "NONE", + q_data_type=dtype, + kv_data_type=torch.float8_e4m3fn if kv_cache_dtype.startswith("fp8") else dtype, + ) + + def baseline_decode(): + return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale) + + baseline_mean, baseline_std = time_fn(baseline_decode) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trt_mean) / baseline_mean + + print( + f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.3f}\t{trt_std.item():.3f}" + f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}" + ) + + # Return results for CSV writing + return { + "num_seqs": num_seqs, + "trt_mean": trt_mean, + "trt_std": trt_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(dtype), + "kv_cache_dtype": kv_cache_dtype, + "page_size": page_size, + "num_kv_heads": num_kv_heads, + "head_dim": head_dim, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "num_seqs", + "trt_mean", + "trt_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + "q_dtype", + "kv_cache_dtype", + "page_size", + "num_kv_heads", + "head_dim", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ == "__main__": + num_seqs = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + print("Running benchmark for kv_cache_dtype: bfloat16") + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in num_seqs: + result = benchmark_decode( + bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto" + ) + all_results.append(result) + + print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8") + print( + 
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in num_seqs: + result = benchmark_decode( + bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8" + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/vllm_v0.10.0/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/vllm_v0.10.0/benchmarks/kernels/benchmark_w8a8_block_fp8.py new file mode 100644 index 0000000..4fcdbad --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -0,0 +1,414 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from sglang quantization/tuning_block_wise_kernel.py + +import argparse +import json +import multiprocessing as mp +import os +import time +from datetime import datetime +from typing import Any + +import torch +import tqdm +import triton + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + _w8a8_block_fp8_matmul, +) +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +mp.set_start_method("spawn", force=True) + +assert current_platform.is_cuda(), ( + "Only support tune w8a8 block fp8 kernel on CUDA device." +) + +DTYPE_MAP = { + "float32": torch.float32, + "float16": torch.float16, + "half": torch.half, + "bfloat16": torch.bfloat16, +} + + +def w8a8_block_matmul( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + config: dict[str, Any], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + """This function performs matrix multiplication with + block-wise quantization. + + It takes two input tensors `A` and `B` with scales `As` and `Bs`. + The output is returned in the specified `output_dtype`. + + Args: + A: The input tensor, e.g., activation. + B: The input tensor, e.g., weight. + As: The per-token-group quantization scale for `A`. + Bs: The per-block quantization scale for `B`. + block_size: The block size for per-block quantization. + It should be 2-dim, e.g., [128, 128]. + output_dytpe: The dtype of the returned tensor. + + Returns: + torch.Tensor: The result of matmul. 
+ """ + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N,) + C = A.new_empty(C_shape, dtype=output_dtype) + + def grid(META): + return ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + if A.dtype == torch.float8_e4m3fn: + kernel = _w8a8_block_fp8_matmul + else: + raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") + + kernel[grid]( + A, + B, + C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + C.stride(-2), + C.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + **config, + ) + + return C + + +def get_configs_compute_bound(): + configs = [] + for num_stages in [2, 3, 4, 5]: + for block_m in [16, 32, 64, 128, 256]: + for block_k in [64, 128]: + for block_n in [32, 64, 128, 256]: + for num_warps in [4, 8]: + for group_size in [1, 16, 32, 64]: + configs.append( + { + "BLOCK_SIZE_M": block_m, + "BLOCK_SIZE_N": block_n, + "BLOCK_SIZE_K": block_k, + "GROUP_SIZE_M": group_size, + "num_warps": num_warps, + "num_stages": num_stages, + } + ) + return configs + + +def get_weight_shapes(tp_size): + # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. + # Modify them, if you tune for another different model. + # cannot TP + total = [ + (512 + 64, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + ] + # N can TP + n_tp = [ + (18432 * 2, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (24576, 1536), + (12288, 7168), + (4096, 7168), + ] + # K can TP + k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)] + + weight_shapes = [] + for t in total: + weight_shapes.append(t) + for n_t in n_tp: + new_t = (n_t[0] // tp_size, n_t[1]) + weight_shapes.append(new_t) + for k_t in k_tp: + new_t = (k_t[0], k_t[1] // tp_size) + weight_shapes.append(new_t) + return weight_shapes + + +def benchmark_config( + A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10 +): + def run(): + w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) + + torch.cuda.synchronize() + # JIT complication & warmup + for _ in range(5): + run() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + run() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + return avg + + +def tune(M, N, K, block_size, out_dtype, search_space, input_type): + factor_for_scale = 1e-2 + + if input_type == "fp8": + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + A_fp32 = ( + (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max + ) + A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = ( + (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max + ) + B = B_fp32.clamp(min=fp8_min, 
max=fp8_max).to(torch.float8_e4m3fn) + else: + raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") + * factor_for_scale + ) + + best_config = None + best_time = float("inf") + for config in tqdm(search_space): + try: + kernel_time = benchmark_config( + A, + B, + As, + Bs, + block_size, + config, + out_dtype, + num_iters=10, + ) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. + continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config + now = datetime.now() + print(f"{now.ctime()}] Completed tuning for batch_size={M}") + assert best_config is not None + return best_config + + +def save_configs( + N, + K, + block_n, + block_k, + configs, + save_path, + input_type="fp8", +) -> None: + os.makedirs(save_path, exist_ok=True) + device_name = current_platform.get_device_name().replace(" ", "_") + json_file_name = ( + f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8," + f"block_shape=[{block_n},{block_k}].json" + ) + + config_file_path = os.path.join(save_path, json_file_name) + print(f"Writing best config to {config_file_path}...") + + with open(config_file_path, "w") as f: + json.dump(configs, f, indent=4) + f.write("\n") + + +def tune_on_gpu(args_dict): + """Run tuning on a specific GPU.""" + gpu_id = args_dict["gpu_id"] + batch_sizes = args_dict["batch_sizes"] + weight_shapes = args_dict["weight_shapes"] + args = args_dict["args"] + + torch.cuda.set_device(gpu_id) + print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") + + block_n = args.block_n + block_k = args.block_k + out_dtype = DTYPE_MAP[args.out_dtype] + save_path = args.save_path + input_type = args.input_type + + search_space = get_configs_compute_bound() + search_space = [ + config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0 + ] + + start = time.time() + for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"): + N, K = shape[0], shape[1] + print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`") + benchmark_results = [ + tune( + batch_size, + N, + K, + [block_n, block_k], + out_dtype, + search_space, + input_type, + ) + for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes") + ] + best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)} + save_configs(N, K, block_n, block_k, best_configs, save_path, input_type) + + end = time.time() + print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds") + + +def distribute_batch_sizes(batch_sizes, num_gpus): + """Distribute batch sizes across available GPUs.""" + batches_per_gpu = [] + for i in range(num_gpus): + start_idx = i * len(batch_sizes) // num_gpus + end_idx = (i + 1) * len(batch_sizes) // num_gpus + batches_per_gpu.append(batch_sizes[start_idx:end_idx]) + return batches_per_gpu + + +def main(args): + print(args) + num_gpus = torch.cuda.device_count() + if num_gpus == 0: + raise RuntimeError("No GPU available for tuning") + print(f"Found {num_gpus} GPUs for parallel tuning") + + torch.cuda.init() + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] 
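+        # Default sweep of batch sizes (M) used when --batch-size is not
+        # given; distribute_batch_sizes() below splits this list contiguously
+        # across the visible GPUs before tuning starts.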
+ else: + batch_sizes = [args.batch_size] + num_gpus = 1 # If only one batch size, use only one GPU + + weight_shapes = get_weight_shapes(args.tp_size) + + batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus) + + process_args = [] + for gpu_id in range(num_gpus): + process_args.append( + { + "gpu_id": gpu_id, + "batch_sizes": batches_per_gpu[gpu_id], + "weight_shapes": weight_shapes, # Each GPU processes all weight shapes + "args": args, + } + ) + + ctx = mp.get_context("spawn") + with ctx.Pool(num_gpus) as pool: + pool.map(tune_on_gpu, process_args) + + print("Multi-GPU tuning completed") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description=""" +Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1: + python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8 +Then copy to model_executor/layers/quantization/utils/configs + """, + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument("--tp-size", "-tp", type=int, default=8) + parser.add_argument("--input-type", type=str, choices=["fp8"], default="fp8") + parser.add_argument( + "--out-dtype", + type=str, + choices=["float32", "float16", "bfloat16", "half"], + default="float16", + ) + parser.add_argument("--block-n", type=int, default=128) + parser.add_argument("--block-k", type=int, default=128) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--save-path", type=str, default="./") + args = parser.parse_args() + + main(args) diff --git a/vllm_v0.10.0/benchmarks/kernels/deepgemm/README.md b/vllm_v0.10.0/benchmarks/kernels/deepgemm/README.md new file mode 100644 index 0000000..917e814 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/deepgemm/README.md @@ -0,0 +1,129 @@ +# DeepSeek DeepGEMM Kernels Benchmark + +This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. + +Currently this just includes dense GEMMs and only works on Hopper GPUs. + +## Setup + +You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: + +``` +git clone --recursive https://github.com/deepseek-ai/DeepGEMM +cd DeepGEMM +python setup.py install +uv pip install -e . +``` + +## Usage + +``` +python benchmark_fp8_block_dense_gemm.py +INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. +===== STARTING FP8 GEMM BENCHMARK ===== +PyTorch version: 2.5.1+cu124 +CUDA version: 12.4 +Triton version: 3.1.0 +Using device: NVIDIA H100 80GB HBM3 +WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! 
Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. + +===== PERFORMANCE COMPARISON ===== + +DeepGEMM Implementation: ++------+-------+-------+-----------+--------+--------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | ++------+-------+-------+-----------+--------+--------+ +| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 | +| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 | +| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 | +| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 | +| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 | +| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 | +| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 | +| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 | +| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 | +| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 | +| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 | +| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 | +| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 | +| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 | +| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 | +| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 | ++------+-------+-------+-----------+--------+--------+ + +vLLM Triton Implementation: ++------+-------+-------+-----------+--------+--------+--------------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | ++------+-------+-------+-----------+--------+--------+--------------+ +| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster | +| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower | +| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower | +| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower | +| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower | +| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower | +| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster | +| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster | +| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower | +| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster | +| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower | +| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower | +| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower | +| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower | +| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower | +| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower | ++------+-------+-------+-----------+--------+--------+--------------+ + +vLLM CUTLASS Implementation: ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster | +| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster | +| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster | +| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 
1.87x faster | 2.15x faster | +| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster | +| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster | +| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster | +| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster | +| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster | +| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster | +| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster | +| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster | +| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 2.43x faster | +| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster | +| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster | +| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ + +===== AVERAGE PERFORMANCE ===== ++----------------+------------+----------+---------------+ +| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) | ++----------------+------------+----------+---------------+ +| DeepGEMM | 310.98 | 1052.10 | 0.11 | +| vLLM Triton | 144.30 | 715.60 | 0.23 | +| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 | ++----------------+------------+----------+---------------+ + +===== AVERAGE SPEEDUPS ===== ++-----------------------------+--------------+ +| Comparison | Speedup | ++-----------------------------+--------------+ +| DeepGEMM vs vLLM Triton | 1.71x faster | +| DeepGEMM vs vLLM CUTLASS | 0.94x slower | +| vLLM CUTLASS vs vLLM Triton | 1.84x faster | ++-----------------------------+--------------+ + +===== ACCURACY COMPARISON ===== ++----------------+-----------------------+ +| Implementation | Avg Diff vs Reference | ++----------------+-----------------------+ +| DeepGEMM | 0.000684 | +| vLLM Triton | 0.000684 | +| vLLM CUTLASS | 0.000684 | ++----------------+-----------------------+ +``` diff --git a/vllm_v0.10.0/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/vllm_v0.10.0/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py new file mode 100644 index 0000000..43c54d5 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -0,0 +1,456 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# fmt: off +# ruff: noqa: E501 +import time + +# Import DeepGEMM functions +import deep_gemm +import torch +from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor + +# Import vLLM functions +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, + w8a8_block_fp8_matmul, +) +from vllm.triton_utils import triton + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-token scaling.""" + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to( + torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) + + +# Copied from +# 
https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-block scaling.""" + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + +def benchmark_shape(m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = False) -> dict: + """Benchmark all implementations for a specific (m, n, k) shape.""" + if verbose: + print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") + + # Create test tensors + A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + + # Reference result in BF16 + torch.cuda.synchronize() + C_ref = A @ B.t() + + # Pre-quantize B for all implementations + # (weights can be pre-quantized offline) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + + # Block size configuration + block_size = [128, 128] + + # Pre-quantize A for all implementations + A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + A, block_size[1], column_major_scales=True) + + # === DeepGEMM Implementation === + def deepgemm_gemm(): + deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + (B_deepgemm, B_scale_deepgemm), + C_deepgemm) + return C_deepgemm + + # === vLLM Triton Implementation === + def vllm_triton_gemm(): + return w8a8_block_fp8_matmul(A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16) + + # === vLLM CUTLASS Implementation === + def vllm_cutlass_gemm(): + return ops.cutlass_scaled_mm(A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16) + + # Run correctness check first + if verbose: + print("Running correctness check...") + C_deepgemm = deepgemm_gemm() + C_vllm_triton = vllm_triton_gemm() + C_vllm_cutlass = vllm_cutlass_gemm() + + deepgemm_diff = calc_diff(C_deepgemm, C_ref) + vllm_triton_diff = calc_diff(C_vllm_triton, C_ref) + vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref) + + if verbose: + print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") + print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") + print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") + print("vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") + print("vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + + # Benchmark implementations + implementations = { + "DeepGEMM": deepgemm_gemm, + "vLLM Triton": vllm_triton_gemm, + "vLLM CUTLASS": vllm_cutlass_gemm + } + + benchmark_results = { + "shape": { + "m": m, + "n": n, + "k": k + }, + 
"implementations": {} + } + + for name, func in implementations.items(): + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing loop + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + func() + torch.cuda.synchronize() + end = time.time() + + # Calculate timing and TFLOPS + avg_time_ms = (end - start) / repeat * 1000 + avg_time_us = avg_time_ms * 1000 + tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12 + gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3) + + benchmark_results["implementations"][name] = { + "time_ms": avg_time_ms, + "time_us": avg_time_us, + "tflops": tflops, + "gb_s": gb_s, + "diff": { + "DeepGEMM": + 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), + "Reference": + deepgemm_diff if name == "DeepGEMM" else + (vllm_triton_diff + if name == "vLLM Triton" else vllm_cutlass_diff) + } + } + + if verbose: + print( + f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" + ) + + # Calculate speedups + baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] + for name, data in benchmark_results["implementations"].items(): + if name != "DeepGEMM": + speedup = baseline / data["time_ms"] + benchmark_results["implementations"][name][ + "speedup_vs_deepgemm"] = speedup + if verbose: + print(f"DeepGEMM is {1/speedup:.2f}x " + f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ + "time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ + "time_ms"] + cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time + benchmark_results["implementations"]["vLLM CUTLASS"][ + "speedup_vs_triton"] = cutlass_vs_triton + if verbose: + print( + f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " + f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton" + ) + + return benchmark_results + + +def format_table_row(values, widths): + """Format a row with specified column widths.""" + return "| " + " | ".join(f"{val:{w}}" + for val, w in zip(values, widths)) + " |" + + +def print_table(headers, rows, title=None): + """Print a table with headers and rows.""" + if title: + print(f"\n{title}") + + # Calculate column widths based on headers and data + widths = [ + max(len(str(h)), max(len(str(row[i])) for row in rows)) + for i, h in enumerate(headers) + ] + + # Create separator line + separator = "+-" + "-+-".join("-" * w for w in widths) + "-+" + + # Print table + print(separator) + print(format_table_row(headers, widths)) + print(separator) + for row in rows: + print(format_table_row(row, widths)) + print(separator) + + +def format_speedup(value): + """Format speedup value with indicator if it's faster or slower.""" + return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}" + + +def run_benchmarks(verbose: bool = False): + """Run benchmarks for a set of common shapes.""" + print("===== STARTING FP8 GEMM BENCHMARK =====") + + # Make sure we're using the GPU + if not torch.cuda.is_available(): + print("CUDA not available! 
Tests require GPU.") + return + + # Print system information + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA version: {torch.version.cuda}") + print(f"Triton version: {triton.__version__}") + print(f"Using device: {torch.cuda.get_device_name()}") + + # Enable TF32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # Set seeds for reproducibility + torch.manual_seed(42) + torch.cuda.manual_seed(42) + + # Define benchmark shapes (m, n, k) + shapes = [ + (8, 4096, 7168), + (8, 7168, 18432), + (8, 18432, 7168), + (64, 4096, 7168), + (64, 7168, 18432), + (64, 18432, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 18432), + (128, 18432, 7168), + (1024, 4096, 7168), + (1024, 18432, 7168), + (2048, 4096, 7168), + (4096, 4096, 7168), + ] + shapes = [ + # (64, 2112, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (64, 4096, 7168), + (64, 7168, 2048), + # (128, 2112, 7168), + (128, 24576, 1536), + (128, 32768, 512), + (128, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 2048), + # (4096, 2112, 7168), + (4096, 24576, 1536), + (4096, 32768, 512), + (4096, 7168, 16384), + (4096, 4096, 7168), + (4096, 7168, 2048), + ] + + all_results = [] + for m, n, k in shapes: + result = benchmark_shape(m, n, k, verbose=verbose) + all_results.append(result) + + # Print results in a nicely formatted table + print("\n===== PERFORMANCE COMPARISON =====") + + # Print DeepGEMM table + deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"] + deepgemm_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["DeepGEMM"] + deepgemm_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" + ]) + + print_table(deepgemm_headers, + deepgemm_rows, + title="DeepGEMM Implementation:") + + # Print vLLM Triton table + triton_headers = [ + "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" + ] + triton_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM Triton"] + speedup = impl_data.get("speedup_vs_deepgemm", 1.0) + triton_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(speedup) + ]) + + print_table(triton_headers, + triton_rows, + title="vLLM Triton Implementation:") + + # Print vLLM CUTLASS table + cutlass_headers = [ + "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", + "vs Triton" + ] + cutlass_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM CUTLASS"] + vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) + vs_triton = impl_data.get("speedup_vs_triton", 1.0) + cutlass_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton) + ]) + + print_table(cutlass_headers, + cutlass_rows, + title="vLLM CUTLASS Implementation:") + + # Calculate and print averages + print("\n===== AVERAGE PERFORMANCE =====") + + implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] + avg_metrics = { + impl: { + "tflops": 0, + "gb_s": 0, + "time_ms": 0 + } + for impl in implementations + } + + for result in all_results: + for impl in implementations: + impl_data 
= result["implementations"][impl] + avg_metrics[impl]["tflops"] += impl_data["tflops"] + avg_metrics[impl]["gb_s"] += impl_data["gb_s"] + avg_metrics[impl]["time_ms"] += impl_data["time_ms"] + + num_shapes = len(all_results) + avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"] + avg_rows = [] + + for impl in implementations: + avg_tflops = avg_metrics[impl]["tflops"] / num_shapes + avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes + avg_time = avg_metrics[impl]["time_ms"] / num_shapes + avg_rows.append([ + impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" + ]) + + print_table(avg_headers, avg_rows) + + # Calculate average speedups + avg_speedups = { + "DeepGEMM vs vLLM Triton": 0, + "DeepGEMM vs vLLM CUTLASS": 0, + "vLLM CUTLASS vs vLLM Triton": 0 + } + + for result in all_results: + deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] + vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ + "time_ms"] + + avg_speedups[ + "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups[ + "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups[ + "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + + print("\n===== AVERAGE SPEEDUPS =====") + speedup_headers = ["Comparison", "Speedup"] + speedup_rows = [] + for comparison, total in avg_speedups.items(): + avg_speedup = total / num_shapes + status = "faster" if avg_speedup > 1 else "slower" + speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"]) + + print_table(speedup_headers, speedup_rows) + + # Average accuracy comparison + print("\n===== ACCURACY COMPARISON =====") + avg_diff = {impl: 0 for impl in implementations} + + for result in all_results: + for impl in implementations: + avg_diff[impl] += result["implementations"][impl]["diff"][ + "Reference"] + + diff_headers = ["Implementation", "Avg Diff vs Reference"] + diff_rows = [] + for impl in implementations: + diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"]) + + print_table(diff_headers, diff_rows) + + +if __name__ == "__main__": + run_benchmarks(verbose=False) diff --git a/vllm_v0.10.0/benchmarks/kernels/graph_machete_bench.py b/vllm_v0.10.0/benchmarks/kernels/graph_machete_bench.py new file mode 100644 index 0000000..9a4da0e --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/graph_machete_bench.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +import pickle +from collections import defaultdict + +import matplotlib.pyplot as plt +import pandas as pd +import regex as re +import seaborn as sns +from torch.utils.benchmark import Measurement as TMeasurement + +from vllm.utils import FlexibleArgumentParser + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the latency of processing a single batch of " + "requests till completion." 
+ ) + parser.add_argument("filename", type=str) + + args = parser.parse_args() + + with open(args.filename, "rb") as f: + data = pickle.load(f) + raw_results: list[TMeasurement] = data["results"] + + results = defaultdict(lambda: list()) + for v in raw_results: + result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) + if result is not None: + KN = result.group(1) + else: + raise Exception("MKN not found") + result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label) + if result is not None: + M = result.group(1) + else: + raise Exception("MKN not found") + + kernel = v.task_spec.description + results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median}) + + rows = int(math.ceil(len(results) / 2)) + fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) + axs = axs.flatten() + for axs_idx, (shape, data) in enumerate(results.items()): + plt.sca(axs[axs_idx]) + df = pd.DataFrame(data) + sns.lineplot( + data=df, + x="batch_size", + y="median", + hue="kernel", + style="kernel", + markers=True, + dashes=False, + palette="Dark2", + ) + plt.title(f"Shape: {shape}") + plt.ylabel("time (median, s)") + plt.tight_layout() + plt.savefig("graph_machete_bench.pdf") diff --git a/vllm_v0.10.0/benchmarks/kernels/requirements.txt b/vllm_v0.10.0/benchmarks/kernels/requirements.txt new file mode 100644 index 0000000..1411a4a --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/requirements.txt @@ -0,0 +1 @@ +pandas \ No newline at end of file diff --git a/vllm_v0.10.0/benchmarks/kernels/utils.py b/vllm_v0.10.0/benchmarks/kernels/utils.py new file mode 100644 index 0000000..4bbb36b --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/utils.py @@ -0,0 +1,214 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import dataclasses +from collections.abc import Iterable +from typing import Any, Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement + + +@dataclasses.dataclass +class CudaGraphBenchParams: + num_ops_in_cuda_graph: int + + +@dataclasses.dataclass +class ArgPool: + """ + When some argument of the benchmarking function is annotated with this type, + the benchmarking class (BenchMM) will collapse the argument to a pick a + single value from the given list of values, during function invocation. + For every invocation during a benchmarking run, it will choose a + different value from the list. 
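+
+    For example, wrapping one positional argument as ArgPool([t0, t1, t2])
+    makes successive invocations of the benchmarked fn cycle through t0, t1
+    and t2 (see collapse_argpool and ArgsIterator below); all ArgPool
+    arguments passed to Bench must hold the same number of values.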
+ """ + + values: Iterable[Any] + + def __getitem__(self, index): + return self.values[index] + + +class Bench: + class ArgsIterator: + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__( + self, + cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, + sub_label: str, + description: str, + fn: Callable, + *args, + **kwargs, + ): + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ + arg for arg in kwargs.values() if isinstance(arg, ArgPool) + ] + if len(argpool_args) == 0: + return [args], [kwargs] + + # Make sure all argpools are of the same size + argpool_size = len(argpool_args[0].values) + assert all([argpool_size == len(arg.values) for arg in argpool_args]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(argpool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + for i in range(argpool_size): + # collapse args; Just pick the ith value + args_list[i] = tuple( + [arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]] + ) + + # collapse kwargs + kwargs_i = kwargs_list[i] + arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)] + for k in arg_pool_keys: + # again just pick the ith value + kwargs_i[k] = kwargs_i[k][i] + kwargs_list[i] = kwargs_i + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(2): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagrah(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {"g": self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=( + f"{self.label}" + f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" + ), + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = """ + args_iterator.reset() + args_it = args_iterator.__next__() + 
""" + stmt = """ + args, kwargs = next(args_it) + fn(*args, **kwargs) + """ + globals = {"fn": self.fn, "args_iterator": self.args_iterator} + else: + # no arg pool. Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = """ + fn(*args, **kwargs) + """ + globals = {"fn": self.fn, "args": args, "kwargs": kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagrah() + else: + timer = self.run_eager() + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") diff --git a/vllm_v0.10.0/benchmarks/kernels/weight_shapes.py b/vllm_v0.10.0/benchmarks/kernels/weight_shapes.py new file mode 100644 index 0000000..a27f023 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kernels/weight_shapes.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "meta-llama/Llama-3.1-405b-hf": [ + ([16384, 18432], 1), + ([16384, 16384], 0), + ([16384, 106496], 1), + ([53248, 16384], 0), + ], + "meta-llama/Llama-3.1-8B-Instruct": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-3.3-70B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "mistralai/Mistral-Large-Instruct-2407": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 57344], 1), + ([28672, 12288], 0), + ], + "Qwen/Qwen2.5-7B-Instruct": [ + ([3584, 4608], 1), + ([3584, 3584], 0), + ([3584, 37888], 1), + ([18944, 3584], 0), + ], + "Qwen/Qwen2.5-32B-Instruct": [ + ([5120, 7168], 1), + ([5120, 5120], 0), + ([5120, 55296], 1), + ([27648, 5120], 0), + ], + "Qwen/Qwen2.5-72B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 59136], 1), + ([29568, 8192], 0), + ], + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ + ([2048, 3072], 1), + ([2048, 4096], 1), + ([2048, 2048], 0), + ([2048, 576], 0), + ([2048, 21888], 
1), + ([10944, 2048], 0), + ([2048, 2816], 1), + ([1408, 2048], 0), + ], +} diff --git a/vllm_v0.10.0/benchmarks/kv_cache/benchmark_block_pool.py b/vllm_v0.10.0/benchmarks/kv_cache/benchmark_block_pool.py new file mode 100644 index 0000000..134551b --- /dev/null +++ b/vllm_v0.10.0/benchmarks/kv_cache/benchmark_block_pool.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +import time +from typing import Optional + +from tabulate import tabulate + +from vllm.utils import FlexibleArgumentParser +from vllm.v1.core.block_pool import BlockPool + + +class Metric: + def __init__(self) -> None: + self.cnt: int = 0 + self.sum_v: int = 0 + self.max_v: Optional[int] = None + + def update(self, v: int) -> None: + self.cnt += 1 + self.sum_v += v + if self.max_v is None: + self.max_v = v + else: + self.max_v = max(self.max_v, v) + + def avg_v(self) -> float: + return self.sum_v * 1.0 / self.cnt + + +def main(args): + rows = [] + for allocate_block in args.allocate_blocks: + # Enforce a GC collect ahead to minimize the impact among runs + gc.collect() + block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) + + get_blocks_metric: Metric = Metric() + free_blocks_metric: Metric = Metric() + for _ in range(args.num_iteration): + t1 = time.monotonic_ns() + blocks = block_pool.get_new_blocks(allocate_block) + t2 = time.monotonic_ns() + block_pool.free_blocks(blocks) + t3 = time.monotonic_ns() + get_blocks_metric.update(t2 - t1) + free_blocks_metric.update(t3 - t2) + + if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None: + rows.append( + [ + get_blocks_metric.cnt, + args.num_gpu_blocks, + allocate_block, + get_blocks_metric.avg_v() / 1000000, + get_blocks_metric.max_v / 1000000.0, + free_blocks_metric.avg_v() / 1000000, + free_blocks_metric.max_v / 1000000.0, + ] + ) + else: + print( + "No valid metrics found." + f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}" + ) + + print( + tabulate( + rows, + headers=[ + "Iterations", + "Total\nBlocks", + "Allocated\nBlocks", + "Get Blocks\nAvg (ms)", + "Get Blocks\nMax (ms)", + "Free Blocks\nAvg (ms)", + "Free Blocks\nMax (ms)", + ], + tablefmt="grid", + floatfmt=".6f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of BlockPool for KV Cache." + ) + parser.add_argument("--num-gpu-blocks", type=int, default=100000) + parser.add_argument( + "--num-iteration", + type=int, + default=1000, + help="Number of iterations to run to stablize final data readings", + ) + parser.add_argument( + "--allocate-blocks", + type=int, + nargs="*", + default=[10, 50, 100, 500, 1000], + help="Number of blocks to allocate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/vllm_v0.10.0/benchmarks/overheads/benchmark_hashing.py b/vllm_v0.10.0/benchmarks/overheads/benchmark_hashing.py new file mode 100644 index 0000000..0957a9c --- /dev/null +++ b/vllm_v0.10.0/benchmarks/overheads/benchmark_hashing.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import cProfile +import pstats + +from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser + +# A very long prompt, total number of tokens is about 15k. 
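+# It is the short sentence below repeated 1000 times; with prefix caching
+# enabled, the cProfile stats collected around llm.generate() are used to
+# report how much of the total runtime is spent in hash_of_block().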
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 +LONG_PROMPT = " ".join(LONG_PROMPT) + + +def main(args): + llm = LLM( + model=args.model, + enforce_eager=True, + enable_prefix_caching=True, + tensor_parallel_size=args.tensor_parallel_size, + ) + + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + profiler = cProfile.Profile() + + print("------warm up------") + for i in range(3): + output = llm.generate(LONG_PROMPT, sampling_params) + print(output[0].outputs[0].text) + + print("------start generating------") + for i in range(3): + profiler.runctx( + "llm.generate(LONG_PROMPT, sampling_params)", globals(), locals() + ) + + # analyze the runtime of hashing function + stats = pstats.Stats(profiler) + stats.sort_stats("cumulative") + total_time = 0 + total_calls = 0 + for func in stats.stats: + if "hash_of_block" in func[2]: + total_time = stats.stats[func][3] + total_calls = stats.stats[func][0] + percentage = (total_time / stats.total_tt) * 100 + print( + f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime." + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the performance of hashing function in" + "automatic prefix caching." + ) + parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k") + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--output-len", type=int, default=10) + parser.add_argument( + "--enable-prefix-caching", action="store_true", help="enable prefix caching" + ) + args = parser.parse_args() + main(args) diff --git a/vllm_v0.10.0/benchmarks/pyproject.toml b/vllm_v0.10.0/benchmarks/pyproject.toml new file mode 100644 index 0000000..65b1e09 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/pyproject.toml @@ -0,0 +1,49 @@ +# This local pyproject file is part of the migration from yapf to ruff format. 
+# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.lint.isort] +known-first-party = ["vllm"] + +[tool.ruff.format] +docstring-code-format = true \ No newline at end of file diff --git a/vllm_v0.10.0/benchmarks/run_structured_output_benchmark.sh b/vllm_v0.10.0/benchmarks/run_structured_output_benchmark.sh new file mode 100755 index 0000000..b043ab8 --- /dev/null +++ b/vllm_v0.10.0/benchmarks/run_structured_output_benchmark.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# default values +MODEL=${MODEL:-"Qwen/Qwen2.5-7B-Instruct"} +BACKEND=${BACKEND:-"vllm"} +DATASET=${DATASET:-"xgrammar_bench"} +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUTPUT_DIR=${OUTPUT_DIR:-"$SCRIPT_DIR/structured_output_benchmark_results"} +PORT=${PORT:-8000} +STRUCTURED_OUTPUT_RATIO=${STRUCTURED_OUTPUT_RATIO:-1} +TOTAL_SECONDS=${TOTAL_SECONDS:-90} +MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-300} +TOKENIZER_MODE=${TOKENIZER_MODE:-"auto"} + +usage() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --model MODEL Model to benchmark (default: $MODEL)" + echo " --backend BACKEND Backend to use (default: $BACKEND)" + echo " --dataset DATASET Dataset to use (default: $DATASET)" + echo " --max-new-tokens N Maximum number of tokens to generate (default: $MAX_NEW_TOKENS)" + echo " --output-dir DIR Output directory for results (default: $OUTPUT_DIR)" + echo " --port PORT Port to use (default: $PORT)" + echo " --structured-output-ratio N Ratio of structured outputs (default: $STRUCTURED_OUTPUT_RATIO)" + echo " --tokenizer-mode MODE Tokenizer mode to use (default: $TOKENIZER_MODE)" + echo " --total-seconds N Total seconds to run the benchmark (default: $TOTAL_SECONDS)" + echo " -h, --help Show this help message and exit" + exit 0 +} + +# parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL="$2" + shift 2 + ;; + --backend) + BACKEND="$2" + shift 2 + ;; + --dataset) + DATASET="$2" + shift 2 + ;; + --max-new-tokens) + MAX_NEW_TOKENS="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --structured-output-ratio) + STRUCTURED_OUTPUT_RATIO="$2" + shift 2 + ;; + --tokenizer-mode) + TOKENIZER_MODE="$2" + shift 2 + ;; + --total-seconds) + TOTAL_SECONDS="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown argument: $1\n" + usage + ;; + esac +done + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Define QPS values to test +QPS_VALUES=(25 20 15 10 5 1) + +# Common parameters +COMMON_PARAMS="--backend $BACKEND \ + --model $MODEL \ + --dataset $DATASET \ + --structured-output-ratio $STRUCTURED_OUTPUT_RATIO \ + --save-results \ + --result-dir $OUTPUT_DIR \ + --output-len $MAX_NEW_TOKENS \ + --port $PORT \ + --tokenizer-mode 
$TOKENIZER_MODE" + +echo "Starting structured output benchmark with model: $MODEL" +echo "Backend: $BACKEND" +echo "Dataset: $DATASET" +echo "Results will be saved to: $OUTPUT_DIR" +echo "----------------------------------------" + +# Run benchmarks with different QPS values +for qps in "${QPS_VALUES[@]}"; do + echo "Running benchmark with QPS: $qps" + + # Get git hash and branch for the filename + GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") + GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") + + # Construct filename for this run + FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" + + NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc) + NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part + echo "Running benchmark with $NUM_PROMPTS prompts" + + # Run the benchmark + python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ + --request-rate $qps \ + --result-filename "$FILENAME" \ + --num-prompts $NUM_PROMPTS + + echo "Completed benchmark with QPS: $qps" + echo "----------------------------------------" +done + +echo "All benchmarks completed!" +echo "Results saved to: $OUTPUT_DIR" diff --git a/vllm_v0.10.0/benchmarks/sonnet.txt b/vllm_v0.10.0/benchmarks/sonnet.txt new file mode 100644 index 0000000..34c444e --- /dev/null +++ b/vllm_v0.10.0/benchmarks/sonnet.txt @@ -0,0 +1,518 @@ +FROM fairest creatures we desire increase, +That thereby beauty's rose might never die, +But as the riper should by time decease, +His tender heir might bear his memory: +But thou, contracted to thine own bright eyes, +Feed'st thy light'st flame with self-substantial fuel, +Making a famine where abundance lies, +Thyself thy foe, to thy sweet self too cruel. +Thou that art now the world's fresh ornament +And only herald to the gaudy spring, +Within thine own bud buriest thy content +And, tender churl, makest waste in niggarding. +Pity the world, or else this glutton be, +To eat the world's due, by the grave and thee. +When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? +Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? 
+Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. +Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. +That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. +Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. +The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! 
deny that thou bear'st love to any, +Who for thyself art so unprovident. +Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. +As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. +When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. +Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. 
+When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? +Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? +Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. 
+A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. +So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. +Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. 
+The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. +Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. +Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? +And each, though enemies to either's reign, +Do in consent shake hands to torture me; +The one by toil, the other to complain +How far I toil, still farther off from thee. +I tell the day, to please them thou art bright +And dost him grace when clouds do blot the heaven: +So flatter I the swart-complexion'd night, +When sparkling stars twire not thou gild'st the even. +But day doth daily draw my sorrows longer +And night doth nightly make grief's strength seem stronger. +When, in disgrace with fortune and men's eyes, +I all alone beweep my outcast state +And trouble deal heaven with my bootless cries +And look upon myself and curse my fate, +Wishing me like to one more rich in hope, +Featured like him, like him with friends possess'd, +Desiring this man's art and that man's scope, +With what I most enjoy contented least; +Yet in these thoughts myself almost despising, +Haply I think on thee, and then my state, +Like to the lark at break of day arising +From sullen earth, sings hymns at heaven's gate; +For thy sweet love remember'd such wealth brings +That then I scorn to change my state with kings. +When to the sessions of sweet silent thought +I summon up remembrance of things past, +I sigh the lack of many a thing I sought, +And with old woes new wail my dear time's waste: +Then can I drown an eye, unused to flow, +For precious friends hid in death's dateless night, +And weep afresh love's long since cancell'd woe, +And moan the expense of many a vanish'd sight: +Then can I grieve at grievances foregone, +And heavily from woe to woe tell o'er +The sad account of fore-bemoaned moan, +Which I new pay as if not paid before. +But if the while I think on thee, dear friend, +All losses are restored and sorrows end. 
+Thy bosom is endeared with all hearts, +Which I by lacking have supposed dead, +And there reigns love and all love's loving parts, +And all those friends which I thought buried. +How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. +If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' +Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. +No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. +Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. 
+I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. +For whether beauty, birth, or wealth, or wit, +Or any of these all, or all, or more, +Entitled in thy parts do crowned sit, +I make my love engrafted to this store: +So then I am not lame, poor, nor despised, +Whilst that this shadow doth such substance give +That I in thy abundance am sufficed +And by a part of all thy glory live. +Look, what is best, that best I wish in thee: +This wish I have; then ten times happy me! \ No newline at end of file diff --git a/vllm_v0.10.0/cmake/cpu_extension.cmake b/vllm_v0.10.0/cmake/cpu_extension.cmake new file mode 100644 index 0000000..21fcee6 --- /dev/null +++ b/vllm_v0.10.0/cmake/cpu_extension.cmake @@ -0,0 +1,305 @@ +include(FetchContent) + +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(MACOSX_FOUND TRUE) +endif() + + +# +# Define environment variables for special configurations +# +set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) +set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) + +include_directories("${CMAKE_SOURCE_DIR}/csrc") + + +set (ENABLE_NUMA TRUE) + +# +# Check the compile flags +# + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + list(APPEND CXX_COMPILE_FLAGS + "-mf16c" + ) +endif() + +if(MACOSX_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-DVLLM_CPU_EXTENSION") +else() + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +endif() + +if (NOT MACOSX_FOUND) + execute_process(COMMAND cat /proc/cpuinfo + RESULT_VARIABLE CPUINFO_RET + OUTPUT_VARIABLE CPUINFO) + if (NOT CPUINFO_RET EQUAL 0) + message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") + endif() +endif() + + +function (find_isa CPUINFO TARGET OUT) + string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) + if(NOT ISA_FOUND EQUAL -1) + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +function (is_avx512_disabled OUT) + set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) + if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +is_avx512_disabled(AVX512_DISABLED) + +if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(APPLE_SILICON_FOUND TRUE) +else() + find_isa(${CPUINFO} "avx2" AVX2_FOUND) + find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "Power11" POWER11_FOUND) + find_isa(${CPUINFO} "POWER10" POWER10_FOUND) + find_isa(${CPUINFO} "POWER9" POWER9_FOUND) + find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support + find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support + find_isa(${CPUINFO} "S390" S390_FOUND) +endif() + + +if (AVX512_FOUND AND NOT AVX512_DISABLED) + list(APPEND CXX_COMPILE_FLAGS + "-mavx512f" + "-mavx512vl" + "-mavx512bw" + "-mavx512dq") + + find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND) + if (AVX512BF16_FOUND OR ENABLE_AVX512BF16) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16") + set(ENABLE_AVX512BF16 ON) + 
else() + set(ENABLE_AVX512BF16 OFF) + message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3") + endif() + else() + set(ENABLE_AVX512BF16 OFF) + message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") + endif() + + find_isa(${CPUINFO} "avx512_vnni" AVX512VNNI_FOUND) + if (AVX512VNNI_FOUND OR ENABLE_AVX512VNNI) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + list(APPEND CXX_COMPILE_FLAGS "-mavx512vnni") + set(ENABLE_AVX512VNNI ON) + else() + set(ENABLE_AVX512VNNI OFF) + message(WARNING "Disable AVX512-VNNI ISA support, requires gcc/g++ >= 12.3") + endif() + else() + set(ENABLE_AVX512VNNI OFF) + message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.") + endif() + +elseif (AVX2_FOUND) + list(APPEND CXX_COMPILE_FLAGS "-mavx2") + message(WARNING "vLLM CPU backend using AVX2 ISA") + +elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) + message(STATUS "PowerPC detected") + if (POWER9_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power9" + "-mtune=power9") + elseif (POWER10_FOUND OR POWER11_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power10" + "-mtune=power10") + endif() + +elseif (ASIMD_FOUND) + message(STATUS "ARMv8 or later architecture detected") + if(ARM_BF16_FOUND) + message(STATUS "BF16 extension detected") + set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") + add_compile_definitions(ARM_BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") + set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") + endif() + list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +elseif(APPLE_SILICON_FOUND) + message(STATUS "Apple Silicon Detected") + set(ENABLE_NUMA OFF) +elseif (S390_FOUND) + message(STATUS "S390 detected") + # Check for S390 VXE support + list(APPEND CXX_COMPILE_FLAGS + "-mvx" + "-mzvector" + "-march=native" + "-mtune=native") +else() + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.") +endif() + +# +# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) +# Flag to enable ACL kernels for AARCH64 platforms +if ( VLLM_BUILD_ACL STREQUAL "ON") + set(USE_ACL ON) +else() + set(USE_ACL OFF) +endif() + +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.8.1 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + + if(USE_ACL) + find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/) + if(NOT ARM_COMPUTE_LIBRARY) + message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR") + endif() + set(ONEDNN_AARCH64_USE_ACL "ON") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") + endif() + + set(ONEDNN_LIBRARY_TYPE "STATIC") + set(ONEDNN_BUILD_DOC "OFF") + set(ONEDNN_BUILD_EXAMPLES "OFF") + set(ONEDNN_BUILD_TESTS "OFF") + set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") + set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") + set(ONEDNN_BUILD_GRAPH "OFF") + set(ONEDNN_ENABLE_JIT_PROFILING "OFF") + set(ONEDNN_ENABLE_ITT_TASKS "OFF") + set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") + set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + FetchContent_MakeAvailable(oneDNN) + + list(APPEND LIBS dnnl) 
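+  # Illustrative note (path is hypothetical): when VLLM_BUILD_ACL is "ON", the
+  # find_library() call above expects a pre-built Arm Compute Library under
+  # $ACL_ROOT_DIR/build/, e.g. `export ACL_ROOT_DIR=$HOME/ComputeLibrary`
+  # before configuring.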
+elseif(POWER10_FOUND) + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.7.2 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + + set(ONEDNN_LIBRARY_TYPE "STATIC") + set(ONEDNN_BUILD_DOC "OFF") + set(ONEDNN_BUILD_EXAMPLES "OFF") + set(ONEDNN_BUILD_TESTS "OFF") + set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") + set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") + set(ONEDNN_BUILD_GRAPH "OFF") + set(ONEDNN_ENABLE_JIT_PROFILING "OFF") + set(ONEDNN_ENABLE_ITT_TASKS "OFF") + set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") + set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + set(DNNL_CPU_RUNTIME "OMP") + + FetchContent_MakeAvailable(oneDNN) + + list(APPEND LIBS dnnl) +endif() + +message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") + +if(ENABLE_NUMA) + list(APPEND LIBS numa) +else() + message(STATUS "NUMA is disabled") + add_compile_definitions(-DVLLM_NUMA_DISABLED) +endif() + +# +# _C extension +# +set(VLLM_EXT_SRC + "csrc/cpu/activation.cpp" + "csrc/cpu/attention.cpp" + "csrc/cpu/cache.cpp" + "csrc/cpu/utils.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/mla_decode.cpp" + "csrc/cpu/pos_encoding.cpp" + "csrc/cpu/torch_bindings.cpp") + +if (AVX512_FOUND AND NOT AVX512_DISABLED) + set(VLLM_EXT_SRC + "csrc/cpu/quant.cpp" + "csrc/cpu/shm.cpp" + ${VLLM_EXT_SRC}) + if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) + set(VLLM_EXT_SRC + "csrc/cpu/sgl-kernels/gemm.cpp" + "csrc/cpu/sgl-kernels/gemm_int8.cpp" + "csrc/cpu/sgl-kernels/gemm_fp8.cpp" + "csrc/cpu/sgl-kernels/moe.cpp" + "csrc/cpu/sgl-kernels/moe_int8.cpp" + "csrc/cpu/sgl-kernels/moe_fp8.cpp" + ${VLLM_EXT_SRC}) + add_compile_definitions(-DCPU_CAPABILITY_AVX512) + endif() +elseif(POWER10_FOUND) + set(VLLM_EXT_SRC + "csrc/cpu/quant.cpp" + ${VLLM_EXT_SRC}) +endif() +if (ASIMD_FOUND) + set(VLLM_EXT_SRC + "csrc/cpu/quant.cpp" + ${VLLM_EXT_SRC}) +endif() + +message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}") + +# +# Define extension targets +# + +define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS} + USE_SABI 3 + WITH_SOABI +) + +message(STATUS "Enabling C extension.") diff --git a/vllm_v0.10.0/cmake/external_projects/flashmla.cmake b/vllm_v0.10.0/cmake/external_projects/flashmla.cmake new file mode 100644 index 0000000..6291475 --- /dev/null +++ b/vllm_v0.10.0/cmake/external_projects/flashmla.cmake @@ -0,0 +1,66 @@ +include(FetchContent) + +# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory +# instead of downloading. +# It can be set as an environment variable or passed as a cmake argument. +# The environment variable takes precedence. +if (DEFINED ENV{FLASH_MLA_SRC_DIR}) + set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR}) +endif() + +if(FLASH_MLA_SRC_DIR) + FetchContent_Declare( + flashmla + SOURCE_DIR ${FLASH_MLA_SRC_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +else() + FetchContent_Declare( + flashmla + GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git + GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845 + GIT_PROGRESS TRUE + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +endif() + + +FetchContent_MakeAvailable(flashmla) +message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") + +# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. 
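+# ("9.0a" below denotes compute_90a/sm_90a, the architecture-specific feature
+# set of compute capability 9.0, i.e. Hopper.)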
+# Only build FlashMLA kernels if we are building for something compatible with +# sm90a +cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}") +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) + set(FlashMLA_SOURCES + ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp + ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu + ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu + ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu) + + set(FlashMLA_INCLUDES + ${flashmla_SOURCE_DIR}/csrc/cutlass/include + ${flashmla_SOURCE_DIR}/csrc/include) + + set_gencode_flags_for_srcs( + SRCS "${FlashMLA_SOURCES}" + CUDA_ARCHS "${FLASH_MLA_ARCHS}") + + define_gpu_extension_target( + _flashmla_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${FlashMLA_SOURCES} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES} + USE_SABI 3 + WITH_SOABI) +else() + # Create an empty target for setup.py when not targeting sm90a systems + add_custom_target(_flashmla_C) +endif() + diff --git a/vllm_v0.10.0/cmake/external_projects/vllm_flash_attn.cmake b/vllm_v0.10.0/cmake/external_projects/vllm_flash_attn.cmake new file mode 100644 index 0000000..ef45a5f --- /dev/null +++ b/vllm_v0.10.0/cmake/external_projects/vllm_flash_attn.cmake @@ -0,0 +1,83 @@ +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) +# we need to manually set VLLM_GPU_ARCHES here. +if(VLLM_GPU_LANG STREQUAL "CUDA") + foreach(_ARCH ${CUDA_ARCHS}) + string(REPLACE "." "" _ARCH "${_ARCH}") + list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real") + endforeach() +endif() + +# +# Build vLLM flash attention from source +# +# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM. +# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs. +# They should be identical but if they aren't, this is a massive footgun. +# +# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. +# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). +# If no component is specified, vllm-flash-attn is still installed. + +# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. +# This is to enable local development of vllm-flash-attn within vLLM. +# It can be set as an environment variable or passed as a cmake argument. +# The environment variable takes precedence. 
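+# Illustrative usage (paths are hypothetical): either export the environment
+# variable, e.g. `export VLLM_FLASH_ATTN_SRC_DIR=$HOME/src/flash-attention`,
+# or pass `-DVLLM_FLASH_ATTN_SRC_DIR=$HOME/src/flash-attention` to cmake; if
+# both are set, the environment variable wins, as noted above.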
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) + set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR}) +endif() + +if(VLLM_FLASH_ATTN_SRC_DIR) + FetchContent_Declare( + vllm-flash-attn SOURCE_DIR + ${VLLM_FLASH_ATTN_SRC_DIR} + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) +else() + FetchContent_Declare( + vllm-flash-attn + GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git + GIT_TAG 1c2624e53c078854e0637ee566c72fe2107e75f4 + GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) +endif() + + +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS) + +# Make sure vllm-flash-attn install rules are nested under vllm/ +# This is here to support installing all components under the same prefix with cmake --install. +# setup.py installs every component separately but uses the same prefix for all. +# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, +# and these statements don't hurt when installing neither component. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS) + +# Fetch the vllm-flash-attn library +FetchContent_MakeAvailable(vllm-flash-attn) +message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") + +# Restore the install prefix +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + +# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in +# case only one is built, in the case both are built redundant work is done) +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm/vllm_flash_attn + COMPONENT _vllm_fa2_C + FILES_MATCHING PATTERN "*.py" +) + +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm/vllm_flash_attn + COMPONENT _vllm_fa3_C + FILES_MATCHING PATTERN "*.py" +) diff --git a/vllm_v0.10.0/cmake/hipify.py b/vllm_v0.10.0/cmake/hipify.py new file mode 100755 index 0000000..55d378f --- /dev/null +++ b/vllm_v0.10.0/cmake/hipify.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# +# A command line tool for running pytorch's hipify preprocessor on CUDA +# source files. +# +# See https://github.com/ROCm/hipify_torch +# and /utils/hipify/hipify_python.py +# + +import argparse +import os +import shutil + +from torch.utils.hipify.hipify_python import hipify + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Project directory where all the source + include files live. + parser.add_argument( + "-p", + "--project_dir", + help="The project directory.", + ) + + # Directory where hipified files are written. + parser.add_argument( + "-o", + "--output_dir", + help="The output directory.", + ) + + # Source files to convert. + parser.add_argument("sources", + help="Source files to hipify.", + nargs="*", + default=[]) + + args = parser.parse_args() + + # Limit include scope to project_dir only + includes = [os.path.join(args.project_dir, '*')] + + # Get absolute path for all source files. 
+ extra_files = [os.path.abspath(s) for s in args.sources] + + # Copy sources from project directory to output directory. + # The directory might already exist to hold object files so we ignore that. + shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) + + hipify_result = hipify(project_directory=args.project_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) + + hipified_sources = [] + for source in args.sources: + s_abs = os.path.abspath(source) + hipified_s_abs = (hipify_result[s_abs].hipified_path if + (s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None) + else s_abs) + hipified_sources.append(hipified_s_abs) + + assert (len(hipified_sources) == len(args.sources)) + + # Print hipified source files. + print("\n".join(hipified_sources)) diff --git a/vllm_v0.10.0/cmake/utils.cmake b/vllm_v0.10.0/cmake/utils.cmake new file mode 100644 index 0000000..621179a --- /dev/null +++ b/vllm_v0.10.0/cmake/utils.cmake @@ -0,0 +1,499 @@ +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. +# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +function (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(_PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$") + + # + # Generate ROCm/HIP source file names from CUDA file names. 
+ # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. + # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + list(APPEND HIP_SRCS ${CXX_SRCS}) + set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) +endfunction() + +# +# Get additional GPU compiler flags from torch. +# +function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) + if (${GPU_LANG} STREQUAL "CUDA") + # + # Get common NVCC flags from torch. + # + run_python(GPU_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND GPU_FLAGS "-DENABLE_FP8") + endif() + if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0) + list(REMOVE_ITEM GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + endif() + + elseif(${GPU_LANG} STREQUAL "HIP") + # + # Get common HIP/HIPCC flags from torch. + # + run_python(GPU_FLAGS + "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + list(APPEND GPU_FLAGS + "-DUSE_ROCM" + "-DENABLE_FP8" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-Werror=unused-variable" + "-fno-gpu-rdc") + + endif() + set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) +endfunction() + +# Macro for converting a `gencode` version number to a cmake version number. +macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() + +# +# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in +# `CUDA_ARCH_FLAGS`. +# +# Example: +# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" +# clear_cuda_arches(CUDA_ARCH_FLAGS) +# CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75" +# CMAKE_CUDA_FLAGS="-Wall" +# +macro(clear_cuda_arches CUDA_ARCH_FLAGS) + # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` + string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) +endmacro() + +# +# Extract unique CUDA architectures from a list of compute capabilities codes in +# the form `[]`, convert them to the form sort +# `.`, dedupes them and then sorts them in ascending order and +# stores them in `OUT_ARCHES`. 
+# +# Example: +# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" +# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS) +# OUT_ARCHES="7.5;...;9.0" +function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) + set(_CUDA_ARCHES) + foreach(_ARCH ${CUDA_ARCH_FLAGS}) + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) + if (_COMPUTE) + set(_COMPUTE ${CMAKE_MATCH_1}) + endif() + + string_to_ver(_COMPUTE_VER ${_COMPUTE}) + list(APPEND _CUDA_ARCHES ${_COMPUTE_VER}) + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHES) + list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING) + set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE) +endfunction() + +# +# For a specific file set the `-gencode` flag in compile options conditionally +# for the CUDA language. +# +# Example: +# set_gencode_flag_for_srcs( +# SRCS "foo.cu" +# ARCH "compute_75" +# CODE "sm_75") +# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for +# `foo.cu` (only for the CUDA language). +# +macro(set_gencode_flag_for_srcs) + set(options) + set(oneValueArgs ARCH CODE) + set(multiValueArgs SRCS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE}) + set_property( + SOURCE ${arg_SRCS} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${_FLAG}>" + ) + + message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}") +endmacro(set_gencode_flag_for_srcs) + +# +# For a list of source files set the `-gencode` flags in the files specific +# compile options (specifically for the CUDA language). +# +# arguments are: +# SRCS: list of source files +# CUDA_ARCHS: list of CUDA architectures in the form `.[letter]` +# BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built +# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS +# that is larger than BUILD_PTX_FOR_ARCH. +# +macro(set_gencode_flags_for_srcs) + set(options) + set(oneValueArgs BUILD_PTX_FOR_ARCH) + set(multiValueArgs SRCS CUDA_ARCHS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + foreach(_ARCH ${arg_CUDA_ARCHS}) + # handle +PTX suffix: generate both sm and ptx codes if requested + string(FIND "${_ARCH}" "+PTX" _HAS_PTX) + if(NOT _HAS_PTX EQUAL -1) + string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}") + string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "compute_${_STRIPPED_ARCH}") + else() + string(REPLACE "." "" _STRIPPED_ARCH "${_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + endif() + endforeach() + + if (${arg_BUILD_PTX_FOR_ARCH}) + list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH) + if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH}) + string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_PTX_ARCH}" + CODE "compute_${_PTX_ARCH}") + endif() + endif() +endmacro() + +# +# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form +# `.[letter]` compute the "loose intersection" with the +# `TGT_CUDA_ARCHS` list of gencodes. 
We also support the `+PTX` suffix in +# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there +# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the +# architecture in `SRC_CUDA_ARCHS`. +# The loose intersection is defined as: +# { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } +# where `<=` is the version comparison operator. +# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version +# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. +# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is +# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add +# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). +# The result is stored in `OUT_CUDA_ARCHS`. +# +# Example: +# SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a" +# TGT_CUDA_ARCHS="8.0;8.9;9.0" +# cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) +# OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a" +# +# Example With PTX: +# SRC_CUDA_ARCHS="8.0+PTX" +# TGT_CUDA_ARCHS="9.0" +# cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) +# OUT_CUDA_ARCHS="8.0+PTX" +# +function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) + set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}") + set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS}) + + # handle +PTX suffix: separate base arch for matching, record PTX requests + set(_PTX_ARCHS) + foreach(_arch ${_SRC_CUDA_ARCHS}) + if(_arch MATCHES "\\+PTX$") + string(REPLACE "+PTX" "" _base "${_arch}") + list(APPEND _PTX_ARCHS "${_base}") + list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") + list(APPEND _SRC_CUDA_ARCHS "${_base}") + endif() + endforeach() + list(REMOVE_DUPLICATES _PTX_ARCHS) + list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS) + + # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should + # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS + set(_CUDA_ARCHS) + foreach(_arch ${_SRC_CUDA_ARCHS}) + if(_arch MATCHES "\\a$") + list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") + string(REPLACE "a" "" _base "${_arch}") + if ("${_base}" IN_LIST TGT_CUDA_ARCHS) + list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}") + list(APPEND _CUDA_ARCHS "${_arch}") + endif() + endif() + endforeach() + + list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). 
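+  # Illustrative trace (archs are hypothetical): with
+  #   _SRC_CUDA_ARCHS="7.0;7.5;8.0" and _TGT_CUDA_ARCHS="8.6;9.0"
+  # the loop below keeps 8.0 for target 8.6 (the highest source arch that is
+  # <= the target and shares its major version) and nothing for 9.0, so the
+  # loose intersection is "8.0".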
+ foreach(_ARCH ${_TGT_CUDA_ARCHS}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check version-less-or-equal, and allow PTX arches to match across majors + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") + endif() + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHS) + + # reapply +PTX suffix to architectures that requested PTX + set(_FINAL_ARCHS) + foreach(_arch ${_CUDA_ARCHS}) + if(_arch IN_LIST _PTX_ARCHS) + list(APPEND _FINAL_ARCHS "${_arch}+PTX") + else() + list(APPEND _FINAL_ARCHS "${_arch}") + endif() + endforeach() + set(_CUDA_ARCHS ${_FINAL_ARCHS}) + + set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) +endfunction() + +# +# Override the GPU architectures detected by cmake/torch and filter them by +# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in +# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set +# the architectures on a per file basis. +# +# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. +# +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") + + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. + # + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) + foreach (_ARCH ${HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) + endif() + endforeach() + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") + endif() + endif() +endmacro() + +# +# Define a target named `GPU_MOD_NAME` for a single extension. The +# arguments are: +# +# DESTINATION - Module destination directory. +# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, +# etc. +# SOURCES - List of source files relative to CMakeLists.txt +# directory. +# +# Optional arguments: +# +# ARCHITECTURES - A list of target GPU architectures in cmake +# format. +# Refer `CMAKE_CUDA_ARCHITECTURES` documentation +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# ARCHITECTURES will use cmake's defaults if +# not provided. +# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - Extra include directories. +# LIBRARIES - Extra link libraries. 
+# WITH_SOABI - Generate library with python SOABI suffix name. +# USE_SABI - Use python stable api +# +# Note: optimization level/debug info is set via cmake build type. +# +function (define_gpu_extension_target GPU_MOD_NAME) + cmake_parse_arguments(PARSE_ARGV 1 + GPU + "WITH_SOABI" + "DESTINATION;LANGUAGE;USE_SABI" + "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") + + # Add hipify preprocessing step when building with HIP/ROCm. + if (GPU_LANGUAGE STREQUAL "HIP") + hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") + endif() + + if (GPU_WITH_SOABI) + set(GPU_WITH_SOABI WITH_SOABI) + else() + set(GPU_WITH_SOABI) + endif() + + if (GPU_USE_SABI) + Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") + else() + Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") + endif() + + if (GPU_LANGUAGE STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. + add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) + endif() + + if (GPU_ARCHITECTURES) + set_target_properties(${GPU_MOD_NAME} PROPERTIES + ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") + endif() + + set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) + + target_compile_options(${GPU_MOD_NAME} PRIVATE + $<$:${GPU_COMPILE_FLAGS}>) + + target_compile_definitions(${GPU_MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") + + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) + + target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) + + # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of + # dependencies that are not necessary and may not be installed. + if (GPU_LANGUAGE STREQUAL "CUDA") + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) + else() + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + endif() + + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) +endfunction() diff --git a/vllm_v0.10.0/csrc/activation_kernels.cu b/vllm_v0.10.0/csrc/activation_kernels.cu new file mode 100644 index 0000000..55e6596 --- /dev/null +++ b/vllm_v0.10.0/csrc/activation_kernels.cu @@ -0,0 +1,225 @@ +#include +#include +#include + +#include + +#include "cuda_compat.h" +#include "dispatch_utils.h" + +namespace vllm { + +template +__device__ __forceinline__ scalar_t compute(const scalar_t& x, + const scalar_t& y) { + return act_first ? ACT_FN(x) * y : x * ACT_FN(y); +} +// Activation and gating kernel template. + +template +__global__ void act_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); + const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); + out[token_idx * d + idx] = compute(x, y); + } +} + +template +__device__ __forceinline__ T silu_kernel(const T& x) { + // x * sigmoid(x) + return (T)(((float)x) / (1.0f + expf((float)-x))); +} + +template +__device__ __forceinline__ T gelu_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'none' approximation. 
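+  // i.e. gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))); ALPHA below is
+  // M_SQRT1_2 = 1 / sqrt(2).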
+ // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 + const float f = (float)x; + constexpr float ALPHA = M_SQRT1_2; + return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA))); +} + +template +__device__ __forceinline__ T gelu_tanh_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'tanh' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + const float f = (float)x; + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + float x_cube = f * f * f; + float inner = BETA * (f + KAPPA * x_cube); + return (T)(0.5f * f * (1.0f + ::tanhf(inner))); +} + +} // namespace vllm + +// Launch activation and gating kernel. +// Use ACT_FIRST (bool) indicating whether to apply the activation function +// first. +#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + if (num_tokens == 0) { \ + return; \ + } \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel, ACT_FIRST> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); + +void silu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true); +} + +void mul_and_silu(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + // The difference between mul_and_silu and silu_and_mul is that mul_and_silu + // applies the silu to the latter half of the input. + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false); +} + +void gelu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true); +} + +void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true); +} + +namespace vllm { + +template +__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { + const float f = (float)x; + return (T)(f > threshold ? 
f : 0.0f); +} + +template +__global__ void act_and_mul_kernel_with_param( + scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, + const float param) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); + const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); + out[token_idx * d + idx] = ACT_FN(x, param) * y; + } +} + +} // namespace vllm + +#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, \ + PARAM); \ + }); + +void fatrelu_and_mul(torch::Tensor& out, // [..., d], + torch::Tensor& input, // [..., 2 * d] + double threshold) { + LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); +} +namespace vllm { + +// Element-wise activation kernel template. +template +__global__ void activation_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); + out[token_idx * d + idx] = ACT_FN(x); + } +} + +} // namespace vllm + +// Launch element-wise activation kernel. +#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ + int d = input.size(-1); \ + int64_t num_tokens = input.numel() / d; \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \ + vllm::activation_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); + +namespace vllm { + +template +__device__ __forceinline__ T gelu_new_kernel(const T& x) { + const float x3 = (float)(x * x * x); + const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3)))); + return ((T)0.5) * x * (((T)1.0) + t); +} + +template +__device__ __forceinline__ T gelu_fast_kernel(const T& x) { + const float f = (float)x; + const T t = + (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x)); + return ((T)0.5) * x * (((T)1.0) + t); +} + +template +__device__ __forceinline__ T gelu_quick_kernel(const T& x) { + // x * sigmoid(1.702 * x) + return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); +} + +} // namespace vllm + +void gelu_new(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); +} + +void gelu_fast(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); +} + +void gelu_quick(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); +} diff --git a/vllm_v0.10.0/csrc/attention/attention_dtypes.h b/vllm_v0.10.0/csrc/attention/attention_dtypes.h new file mode 100644 index 0000000..64f8638 --- /dev/null +++ 
b/vllm_v0.10.0/csrc/attention/attention_dtypes.h @@ -0,0 +1,7 @@ +#pragma once + +#include "attention_generic.cuh" +#include "dtype_float16.cuh" +#include "dtype_float32.cuh" +#include "dtype_bfloat16.cuh" +#include "dtype_fp8.cuh" diff --git a/vllm_v0.10.0/csrc/attention/attention_generic.cuh b/vllm_v0.10.0/csrc/attention/attention_generic.cuh new file mode 100644 index 0000000..62409c0 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/attention_generic.cuh @@ -0,0 +1,65 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace vllm { + +// A vector type to store Q, K, V elements. +template +struct Vec {}; + +// A vector type to store FP32 accumulators. +template +struct FloatVec {}; + +// Template vector operations. +template +inline __device__ Acc mul(A a, B b); + +template +inline __device__ float sum(T v); + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +template +inline __device__ void zero(T& dst) { + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; + +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/attention/attention_kernels.cuh b/vllm_v0.10.0/csrc/attention/attention_kernels.cuh new file mode 100644 index 0000000..8f24be8 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/attention_kernels.cuh @@ -0,0 +1,670 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "attention_dtypes.h" +#include "attention_utils.cuh" +#include "cuda_compat.h" + +#ifdef USE_ROCM + #include + #include "../quantization/fp8/amd/quant_utils.cuh" +typedef __hip_bfloat16 __nv_bfloat16; +#else + #include "../quantization/fp8/nvidia/quant_utils.cuh" +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +namespace vllm { + +// Utility function for attention softmax. +template +inline __device__ float block_sum(float* red_smem, float sum) { + // Decompose the thread index into warp / lane. + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + + // Compute the sum per warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += VLLM_SHFL_XOR_SYNC(sum, mask); + } + + // Warp leaders store the data to shared memory. + if (lane == 0) { + red_smem[warp] = sum; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The warps compute the final sums. + if (lane < NUM_WARPS) { + sum = red_smem[lane]; + } + + // Parallel reduction inside the warp. +#pragma unroll + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + sum += VLLM_SHFL_XOR_SYNC(sum, mask); + } + + // Broadcast to other threads. + return VLLM_SHFL_SYNC(sum, 0); +} + +// TODO(woosuk): Merge the last two dimensions of the grid. +// Grid: (num_heads, num_seqs, max_num_partitions). +template // Zero means no partitioning. +__device__ void paged_attention_kernel( + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, // [num_heads] + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const float* k_scale, const float* v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + const int seq_idx = blockIdx.y; + const int partition_idx = blockIdx.z; + const int max_num_partitions = gridDim.z; + constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0; + const int seq_len = seq_lens[seq_idx]; + if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) { + // No work to do. Terminate the thread block. + return; + } + + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int num_blocks_per_partition = + USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks; + + // [start_block_idx, end_block_idx) is the range of blocks to process. + const int start_block_idx = + USE_PARTITIONING ? 
partition_idx * num_blocks_per_partition : 0; + const int end_block_idx = + MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks); + const int num_blocks = end_block_idx - start_block_idx; + + // [start_token_idx, end_token_idx) is the range of tokens to process. + const int start_token_idx = start_block_idx * BLOCK_SIZE; + const int end_token_idx = + MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len); + const int num_tokens = end_token_idx - start_token_idx; + + constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); + constexpr int NUM_THREAD_GROUPS = + NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE + // divides NUM_THREADS + assert(NUM_THREADS % THREAD_GROUP_SIZE == 0); + constexpr int NUM_TOKENS_PER_THREAD_GROUP = + DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int thread_idx = threadIdx.x; + const int warp_idx = thread_idx / WARP_SIZE; + const int lane = thread_idx % WARP_SIZE; + + const int head_idx = blockIdx.x; + const int num_heads = gridDim.x; + const int num_queries_per_kv = num_heads / num_kv_heads; + const int kv_head_idx = head_idx / num_queries_per_kv; + const float alibi_slope = + alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx]; + + // A vector type to store a part of a key or a query. + // The vector size is configured in such a way that the threads in a thread + // group fetch or compute 16 bytes at a time. For example, if the size of a + // thread group is 4 and the data type is half, then the vector size is 16 / + // (4 * sizeof(half)) == 2. + constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); + using K_vec = typename Vec::Type; + using Q_vec = typename Vec::Type; + using Quant_vec = typename Vec::Type; + + constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE; + constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE; + + const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE; + const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE; + + // Load the query to registers. + // Each thread in a thread group has a different part of the query. + // For example, if the thread group size is 4, then the first thread in + // the group has 0, 4, 8, ... th vectors of the query, and the second thread + // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because + // q is split from a qkv tensor, it may not be contiguous. + const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; + __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; +#pragma unroll + for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; + i += NUM_THREAD_GROUPS) { + const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE; + q_vecs[thread_group_offset][i] = + *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE); + } + __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a + // memory wall right before we use q_vecs + + // Memory planning. + extern __shared__ char shared_mem[]; + // NOTE(woosuk): We use FP32 for the softmax logits for better accuracy. + float* logits = reinterpret_cast(shared_mem); + // Workspace for reduction. + __shared__ float red_smem[2 * NUM_WARPS]; + + // x == THREAD_GROUP_SIZE * VEC_SIZE + // Each thread group fetches x elements from the key at a time. + constexpr int x = 16 / sizeof(cache_t); + float qk_max = -FLT_MAX; + + // Iterate over the key blocks. + // Each warp fetches a block of keys for each iteration. 
+ // Each thread group in a warp fetches a key from the block, and computes + // dot product with the query. + const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; + + // blocksparse specific vars + int bs_block_offset; + int q_bs_block_id; + if constexpr (IS_BLOCK_SPARSE) { + // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len, + // blocksparse_block_size); + q_bs_block_id = (seq_len - 1) / blocksparse_block_size; + if (blocksparse_head_sliding_step >= 0) + // sliding on q heads + bs_block_offset = + (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1; + else + // sliding on kv heads + bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) * + (-blocksparse_head_sliding_step) + + 1; + } + + for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; + block_idx += NUM_WARPS) { + // NOTE(woosuk): The block number is stored in int32. However, we cast it to + // int64 because int32 can lead to overflow when this variable is multiplied + // by large numbers (e.g., kv_block_stride). + // For blocksparse attention: skip computation on blocks that are not + // attended + if constexpr (IS_BLOCK_SPARSE) { + const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size; + const bool is_remote = + ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0); + const bool is_local = + (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks); + if (!is_remote && !is_local) { + for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { + const int physical_block_offset = + (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; + const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; + + if (thread_group_offset == 0) { + // NOTE(linxihui): assign very large number to skipped tokens to + // avoid contribution to the sumexp softmax normalizer. This will + // not be used at computing sum(softmax*v) as the blocks will be + // skipped. + logits[token_idx - start_token_idx] = -FLT_MAX; + } + } + continue; + } + } + const int64_t physical_block_number = + static_cast(block_table[block_idx]); + + // Load a key to registers. + // Each thread in a thread group has a different part of the key. + // For example, if the thread group size is 4, then the first thread in + // the group has 0, 4, 8, ... th vectors of the key, and the second thread + // has 1, 5, 9, ... th vectors of the key, and so on. + for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { + const int physical_block_offset = + (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; + const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; + K_vec k_vecs[NUM_VECS_PER_THREAD]; + +#pragma unroll + for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { + const cache_t* k_ptr = + k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + physical_block_offset * x; + const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; + const int offset1 = (vec_idx * VEC_SIZE) / x; + const int offset2 = (vec_idx * VEC_SIZE) % x; + + if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { + k_vecs[j] = *reinterpret_cast( + k_ptr + offset1 * BLOCK_SIZE * x + offset2); + } else { + // Vector conversion from Quant_vec to K_vec. + Quant_vec k_vec_quant = *reinterpret_cast( + k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = fp8::scaled_convert( + k_vec_quant, *k_scale); + } + } + + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
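// Editor's sketch (not part of this commit): host-side index arithmetic for the
// [num_blocks, num_kv_heads, head_size/x, block_size, x] key-cache layout,
// mirroring the offset1/offset2 computation used above. It assumes a contiguous
// cache, so the strides equal the products below; the function name and
// parameters are illustrative only. Here x == 16 / sizeof(cache_t).
#include <cstdint>

int64_t k_cache_flat_index(int64_t block, int64_t kv_head, int64_t head_elem,
                           int64_t token_in_block, int64_t num_kv_heads,
                           int64_t head_size, int64_t block_size, int64_t x) {
  const int64_t kv_head_stride = (head_size / x) * block_size * x;  // == head_size * block_size
  const int64_t kv_block_stride = num_kv_heads * kv_head_stride;
  return block * kv_block_stride + kv_head * kv_head_stride +
         (head_elem / x) * block_size * x +  // offset1 * BLOCK_SIZE * x
         token_in_block * x +                // physical_block_offset * x
         head_elem % x;                      // offset2
}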
+ float qk = scale * Qk_dot::dot( + q_vecs[thread_group_offset], k_vecs); + // Add the ALiBi bias if slopes are given. + qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0; + + if (thread_group_offset == 0) { + // Store the partial reductions to shared memory. + // NOTE(woosuk): It is required to zero out the masked logits. + const bool mask = token_idx >= seq_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + // Update the max value. + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + } + } + } + + // Perform reduction across the threads in the same warp to get the + // max qk value for each "warp" (not across the thread block yet). + // The 0-th thread of each thread group already has its max qk value. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + if (lane == 0) { + red_smem[warp_idx] = qk_max; + } + __syncthreads(); + + // TODO(woosuk): Refactor this part. + // Get the max qk value for the sequence. + qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + // Broadcast the max qk value to all threads. + qk_max = VLLM_SHFL_SYNC(qk_max, 0); + + // Get the sum of the exp values. + float exp_sum = 0.f; + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; + } + exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); + + // Compute softmax. + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; + } + __syncthreads(); + + // If partitioning is enabled, store the max logit and exp_sum. + if (USE_PARTITIONING && thread_idx == 0) { + float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions + partition_idx; + *max_logits_ptr = qk_max; + float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions + partition_idx; + *exp_sums_ptr = exp_sum; + } + + // Each thread will fetch 16 bytes from the value cache at a time. + constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); + using V_vec = typename Vec::Type; + using L_vec = typename Vec::Type; + using V_quant_vec = typename Vec::Type; + using Float_L_vec = typename FloatVec::Type; + + constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE; + constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW; + constexpr int NUM_ROWS_PER_THREAD = + DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER); + + // NOTE(woosuk): We use FP32 for the accumulator for better accuracy. + float accs[NUM_ROWS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + accs[i] = 0.f; + } + + scalar_t zero_value; + zero(zero_value); + for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; + block_idx += NUM_WARPS) { + // NOTE(woosuk): The block number is stored in int32. However, we cast it to + // int64 because int32 can lead to overflow when this variable is multiplied + // by large numbers (e.g., kv_block_stride). 
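// Editor's sketch (not part of this commit): a scalar reference for the
// numerically stable softmax the thread block computes over `logits` above:
// subtract the running max before exponentiating, then normalize by the sum
// (with the same 1e-6 epsilon the kernel uses). The function name is
// illustrative.
#include <cmath>
#include <limits>
#include <vector>

void softmax_reference(std::vector<float>& logits) {
  float qk_max = -std::numeric_limits<float>::infinity();
  for (float v : logits) qk_max = std::fmax(qk_max, v);
  float exp_sum = 0.f;
  for (float& v : logits) {
    v = std::exp(v - qk_max);  // matches __expf(logits[i] - qk_max)
    exp_sum += v;
  }
  const float inv_sum = 1.f / (exp_sum + 1e-6f);
  for (float& v : logits) v *= inv_sum;
}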
+ // For blocksparse attention: skip computation on blocks that are not + // attended + if constexpr (IS_BLOCK_SPARSE) { + int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size; + if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) && + !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) { + continue; + } + } + const int64_t physical_block_number = + static_cast(block_table[block_idx]); + const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; + const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; + L_vec logits_vec; + from_float(logits_vec, *reinterpret_cast(logits + token_idx - + start_token_idx)); + + const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride; +#pragma unroll + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE) { + const int offset = row_idx * BLOCK_SIZE + physical_block_offset; + V_vec v_vec; + + if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { + v_vec = *reinterpret_cast(v_ptr + offset); + } else { + V_quant_vec v_quant_vec = + *reinterpret_cast(v_ptr + offset); + // Vector conversion from V_quant_vec to V_vec. + v_vec = fp8::scaled_convert(v_quant_vec, + *v_scale); + } + if (block_idx == num_seq_blocks - 1) { + // NOTE(woosuk): When v_vec contains the tokens that are out of the + // context, we should explicitly zero out the values since they may + // contain NaNs. See + // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 + scalar_t* v_vec_ptr = reinterpret_cast(&v_vec); +#pragma unroll + for (int j = 0; j < V_VEC_SIZE; j++) { + v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value; + } + } + accs[i] += dot(logits_vec, v_vec); + } + } + } + + // Perform reduction within each warp. +#pragma unroll + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + float acc = accs[i]; +#pragma unroll + for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { + acc += VLLM_SHFL_XOR_SYNC(acc, mask); + } + accs[i] = acc; + } + + // NOTE(woosuk): A barrier is required because the shared memory space for + // logits is reused for the output. + __syncthreads(); + + // Perform reduction across warps. + float* out_smem = reinterpret_cast(shared_mem); +#pragma unroll + for (int i = NUM_WARPS; i > 1; i /= 2) { + int mid = i / 2; + // Upper warps write to shared memory. + if (warp_idx >= mid && warp_idx < i) { + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; +#pragma unroll + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + dst[row_idx] = accs[i]; + } + } + } + __syncthreads(); + + // Lower warps update the output. + if (warp_idx < mid) { + const float* src = &out_smem[warp_idx * HEAD_SIZE]; +#pragma unroll + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + accs[i] += src[row_idx]; + } + } + } + __syncthreads(); + } + + // Write the final output. 
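// Editor's sketch (not part of this commit): the butterfly (xor-shuffle)
// reduction pattern used by the qk_max and accumulator reductions above.
// VLLM_SHFL_XOR_SYNC is a thin wrapper over __shfl_xor_sync on CUDA; after
// log2(32) steps every lane of the warp holds the full sum.
__device__ float warp_reduce_sum_sketch(float v) {
#pragma unroll
  for (int mask = 32 / 2; mask >= 1; mask /= 2) {
    v += __shfl_xor_sync(0xffffffffu, v, mask);
  }
  return v;
}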
+ if (warp_idx == 0) { + scalar_t* out_ptr = + out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE; +#pragma unroll + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } + } + } +} + +// Grid: (num_heads, num_seqs, 1). +template +__global__ void paged_attention_v1_kernel( + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, // [num_heads] + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const float* k_scale, const float* v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + paged_attention_kernel( + /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, + v_cache, num_kv_heads, scale, block_tables, seq_lens, + max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, + kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks, + blocksparse_vert_stride, blocksparse_block_size, + blocksparse_head_sliding_step); +} + +// Grid: (num_heads, num_seqs, max_num_partitions). +template +__global__ void paged_attention_v2_kernel( + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, // [num_heads] + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const float* k_scale, const float* v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + paged_attention_kernel( + exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, + block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, + blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, + blocksparse_head_sliding_step); +} + +// Grid: (num_heads, num_seqs). 
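// Editor's sketch (not part of this commit): how a host launcher would size the
// grids implied by the "Grid:" comments in this file. All names here are
// illustrative; the actual launch code is not shown in this hunk.
struct PagedAttentionGridsSketch {
  dim3 v1;      // (num_heads, num_seqs, 1)
  dim3 v2;      // (num_heads, num_seqs, max_num_partitions)
  dim3 reduce;  // (num_heads, num_seqs)
};

inline PagedAttentionGridsSketch make_paged_attention_grids(
    int num_heads, int num_seqs, int max_seq_len, int partition_size) {
  const int max_num_partitions =
      (max_seq_len + partition_size - 1) / partition_size;  // DIVIDE_ROUND_UP
  return {dim3(num_heads, num_seqs, 1),
          dim3(num_heads, num_seqs, max_num_partitions),
          dim3(num_heads, num_seqs)};
}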
+template +__global__ void paged_attention_v2_reduce_kernel( + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_partitions) { + const int num_heads = gridDim.x; + const int head_idx = blockIdx.x; + const int seq_idx = blockIdx.y; + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); + if (num_partitions == 1) { + // No need to reduce. Only copy tmp_out to out. + scalar_t* out_ptr = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE; + for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) { + out_ptr[i] = tmp_out_ptr[i]; + } + // Terminate the thread block. + return; + } + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warp_idx = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + + // Size: 2 * num_partitions. + extern __shared__ char shared_mem[]; + // Workspace for reduction. + __shared__ float red_smem[2 * NUM_WARPS]; + + // Load max logits to shared memory. + float* shared_max_logits = reinterpret_cast(shared_mem); + const float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + float max_logit = -FLT_MAX; + for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { + const float l = max_logits_ptr[i]; + shared_max_logits[i] = l; + max_logit = fmaxf(max_logit, l); + } + __syncthreads(); + + // Get the global max logit. + // Reduce within the warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); + } + if (lane == 0) { + red_smem[warp_idx] = max_logit; + } + __syncthreads(); + // Reduce across warps. + max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); + } + // Broadcast the max value to all threads. + max_logit = VLLM_SHFL_SYNC(max_logit, 0); + + // Load rescaled exp sums to shared memory. + float* shared_exp_sums = + reinterpret_cast(shared_mem + sizeof(float) * num_partitions); + const float* exp_sums_ptr = exp_sums + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + float global_exp_sum = 0.0f; + for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { + float l = shared_max_logits[i]; + float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit); + global_exp_sum += rescaled_exp_sum; + shared_exp_sums[i] = rescaled_exp_sum; + } + __syncthreads(); + global_exp_sum = block_sum(&red_smem[NUM_WARPS], global_exp_sum); + const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f); + + // Aggregate tmp_out to out. 
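// Editor's sketch (not part of this commit): a scalar reference for the
// per-partition merge performed below. Each partition j contributed
// (max_logit[j], exp_sum[j], partial[j]) from a softmax taken against its own
// local max; rescaling by exp(max_logit[j] - global_max) puts every partition
// on a common scale before the weighted sum. Names are illustrative.
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

float merge_partitions_reference(const std::vector<float>& max_logits,
                                 const std::vector<float>& exp_sums,
                                 const std::vector<float>& partial) {
  float global_max = -std::numeric_limits<float>::infinity();
  for (float l : max_logits) global_max = std::fmax(global_max, l);
  float global_exp_sum = 0.f;
  std::vector<float> rescaled(exp_sums.size());
  for (std::size_t j = 0; j < exp_sums.size(); ++j) {
    rescaled[j] = exp_sums[j] * std::exp(max_logits[j] - global_max);
    global_exp_sum += rescaled[j];
  }
  float acc = 0.f;
  for (std::size_t j = 0; j < partial.size(); ++j) {
    acc += partial[j] * rescaled[j] / (global_exp_sum + 1e-6f);
  }
  return acc;  // one element of one head's output
}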
+ const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE; + scalar_t* out_ptr = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; +#pragma unroll + for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) { + float acc = 0.0f; + for (int j = 0; j < num_partitions; ++j) { + acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * + inv_global_exp_sum; + } + from_float(out_ptr[i], acc); + } +} + +} // namespace vllm + +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP diff --git a/vllm_v0.10.0/csrc/attention/attention_utils.cuh b/vllm_v0.10.0/csrc/attention/attention_utils.cuh new file mode 100644 index 0000000..826b0ed --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/attention_utils.cuh @@ -0,0 +1,57 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../cuda_compat.h" +#include "attention_dtypes.h" + +#include +#include + +namespace vllm { + +// Q*K^T operation. +template +inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { + using A_vec = typename FloatVec::Type; + // Compute the parallel products for Q*K^T (treat vector lanes separately). + A_vec qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = vllm::fma(q[ii], k[ii], qk_vec); + } + + // Finalize the reduction across lanes. + float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { + qk += VLLM_SHFL_XOR_SYNC(qk, mask); + } + return qk; +} + +template +struct Qk_dot { + template + static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { + return qk_dot_(q, k); + } +}; + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/attention/dtype_bfloat16.cuh b/vllm_v0.10.0/csrc/attention/dtype_bfloat16.cuh new file mode 100644 index 0000000..97a25ba --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/dtype_bfloat16.cuh @@ -0,0 +1,463 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * and + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "attention_generic.cuh" +#include "dtype_float32.cuh" + +#ifndef USE_ROCM + #include + #include +#else + #include + #include + +typedef __hip_bfloat162 __nv_bfloat162; +typedef __hip_bfloat16 __nv_bfloat16; +#endif + +#include + +namespace vllm { + +// Define custom BF16 vector data types. +struct bf16_4_t { + __nv_bfloat162 x; + __nv_bfloat162 y; +}; + +struct bf16_8_t { + __nv_bfloat162 x; + __nv_bfloat162 y; + __nv_bfloat162 z; + __nv_bfloat162 w; +}; + +// BF16 vector types for Q, K, V. +template <> +struct Vec<__nv_bfloat16, 1> { + using Type = __nv_bfloat16; +}; +template <> +struct Vec<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; +}; +template <> +struct Vec<__nv_bfloat16, 4> { + using Type = bf16_4_t; +}; +template <> +struct Vec<__nv_bfloat16, 8> { + using Type = bf16_8_t; +}; + +// FP32 accumulator vector types corresponding to Vec. +template <> +struct FloatVec<__nv_bfloat16> { + using Type = float; +}; +template <> +struct FloatVec<__nv_bfloat162> { + using Type = float2; +}; +template <> +struct FloatVec { + using Type = Float4_; +}; +template <> +struct FloatVec { + using Type = Float8_; +}; + +// Utility functions for type conversions. +inline __device__ float2 bf1622float2(const __nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __bfloat1622float2(val); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __bfloat162bfloat162(val); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +// Vector addition. +inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + #ifndef USE_ROCM + return a + b; + #else + return __hadd(a, b); + #endif +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __hadd2(a, b); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) { + bf16_4_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b) { + bf16_8_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ float2 add(__nv_bfloat162 a, float2 fb) { + float2 fa = bf1622float2(a); + return add(fa, fb); +} + +inline __device__ Float4_ add(bf16_4_t a, Float4_ fb) { + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) { + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} + +// Vector multiplication. 
+template <> +inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __hmul(a, b); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +template <> +inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __hmul2(a, b); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +template <> +inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) { + return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); +} + +template <> +inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) { + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + return c; +} + +template <> +inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) { + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + return c; +} + +template <> +inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) { + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.z, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.w, b.w); + return c; +} + +template <> +inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) { + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.w); + return c; +} + +template <> +inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) { + float fa = __bfloat162float(a); + float fb = __bfloat162float(b); + return fa * fb; +} + +template <> +inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) { + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return mul(fa, fb); +} + +template <> +inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) { + return mul(bf162bf162(a), b); +} + +template <> +inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) { + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +template <> +inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) { + __nv_bfloat162 s = bf162bf162(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +template <> +inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) { + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +template <> +inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) { + __nv_bfloat162 s = bf162bf162(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} + +// Vector fused multiply-add. 
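// Editor's sketch (not part of this commit): how the generic mul/fma/sum
// operations compose into a dot product over packed bf16 fragments with an
// fp32 accumulator, mirroring qk_dot_ in attention_utils.cuh for N == 2.
// Assumes a device with __CUDA_ARCH__ >= 800; the function name is
// illustrative.
__device__ float bf16x8_dot_sketch(const vllm::bf16_4_t (&q)[2],
                                   const vllm::bf16_4_t (&k)[2]) {
  vllm::Float4_ acc =
      vllm::mul<vllm::Float4_, vllm::bf16_4_t, vllm::bf16_4_t>(q[0], k[0]);
  acc = vllm::fma(q[1], k[1], acc);  // packed bf16 multiply, fp32 accumulate
  return vllm::sum(acc);             // horizontal add of the four fp32 lanes
}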
+inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, + __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __hfma2(a, b, c); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, + __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + return __hfma2(bf162bf162(a), b, c); +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) { + bf16_4_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ bf16_4_t fma(__nv_bfloat16 a, bf16_4_t b, bf16_4_t c) { + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +inline __device__ bf16_8_t fma(bf16_8_t a, bf16_8_t b, bf16_8_t c) { + bf16_8_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ bf16_8_t fma(__nv_bfloat16 a, bf16_8_t b, bf16_8_t c) { + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +inline __device__ float fma(__nv_bfloat16 a, __nv_bfloat16 b, float fc) { + return __bfloat162float(a) * __bfloat162float(b) + fc; +} + +inline __device__ float2 fma(__nv_bfloat162 a, __nv_bfloat162 b, float2 fc) { + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return fma(fa, fb, fc); +} + +inline __device__ float2 fma(__nv_bfloat16 a, __nv_bfloat162 b, float2 fc) { + return fma(bf162bf162(a), b, fc); +} + +inline __device__ Float4_ fma(bf16_4_t a, bf16_4_t b, Float4_ fc) { + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +inline __device__ Float4_ fma(__nv_bfloat16 a, bf16_4_t b, Float4_ fc) { + __nv_bfloat162 s = bf162bf162(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +inline __device__ Float8_ fma(bf16_8_t a, bf16_8_t b, Float8_ fc) { + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) { + __nv_bfloat162 s = bf162bf162(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} + +// Vector sum. +template <> +inline __device__ float sum(__nv_bfloat16 v) { + return __bfloat162float(v); +} + +template <> +inline __device__ float sum(__nv_bfloat162 v) { + float2 vf = bf1622float2(v); + return vf.x + vf.y; +} + +template <> +inline __device__ float sum(bf16_4_t v) { + return sum(v.x) + sum(v.y); +} + +template <> +inline __device__ float sum(bf16_8_t v) { + return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); +} + +// From float32 to bfloat16. 
+inline __device__ void from_float(__nv_bfloat16& dst, float src) { + dst = __float2bfloat16(src); +} + +inline __device__ void from_float(__nv_bfloat162& dst, float2 src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + dst = __float22bfloat162_rn(src); +#endif +} + +inline __device__ void from_float(bf16_4_t& dst, Float4_ src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); +#endif +} + +inline __device__ void from_float(bf16_8_t& dst, Float8_ src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); + dst.z = __float22bfloat162_rn(src.z); + dst.w = __float22bfloat162_rn(src.w); +#endif +} + +// From bfloat16 to float32. +inline __device__ float to_float(__nv_bfloat16 u) { + return __bfloat162float(u); +} + +// Zero-out a variable. +inline __device__ void zero(__nv_bfloat16& dst) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + // Same as CUDART_ZERO_BF16 introduced in CUDA 12.2. + dst = __ushort_as_bfloat16((unsigned short)0x0000U); +#endif +} + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/attention/dtype_float16.cuh b/vllm_v0.10.0/csrc/attention/dtype_float16.cuh new file mode 100644 index 0000000..3a1815f --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/dtype_float16.cuh @@ -0,0 +1,504 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * and + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "attention_generic.cuh" +#include "dtype_float32.cuh" + +#ifdef USE_ROCM + #include +#endif + +#include + +namespace vllm { + +// FP16 vector types for Q, K, V. +template <> +struct Vec { + using Type = uint16_t; +}; +template <> +struct Vec { + using Type = uint32_t; +}; +template <> +struct Vec { + using Type = uint2; +}; +template <> +struct Vec { + using Type = uint4; +}; + +// FP32 accumulator vector types corresponding to Vec. +template <> +struct FloatVec { + using Type = float; +}; +template <> +struct FloatVec { + using Type = float2; +}; +template <> +struct FloatVec { + using Type = Float4_; +}; +template <> +struct FloatVec { + using Type = Float8_; +}; + +// Utility functions for type conversions. 
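// Editor's sketch (not part of this commit): what the fp16 trait
// specializations above resolve to, and a worked example of the VEC_SIZE rule
// in attention_kernels.cuh (a thread group fetches 16 bytes at a time): with
// THREAD_GROUP_SIZE == 4 and fp16 keys, VEC_SIZE = 16 / (4 * sizeof(uint16_t))
// == 2, so each thread loads a Vec<uint16_t, 2>::Type (one uint32_t holding two
// halves) and the four threads together cover 16 bytes.
#include <cstdint>
#include <type_traits>

static_assert(std::is_same<vllm::Vec<uint16_t, 2>::Type, uint32_t>::value,
              "two packed halves live in one 32-bit register");
static_assert(std::is_same<vllm::Vec<uint16_t, 8>::Type, uint4>::value,
              "eight halves are loaded as one 16-byte uint4");
static_assert(std::is_same<vllm::FloatVec<uint32_t>::Type, float2>::value,
              "the fp32 accumulator mirrors the packed width");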
+inline __device__ uint32_t h0_h0(uint16_t a) { +#ifndef USE_ROCM + uint32_t b; + asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); + return b; +#else + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u16[0] = a; + tmp.u16[1] = a; + return tmp.u32; +#endif +} + +inline __device__ float half_to_float(uint16_t h) { + float f; +#ifndef USE_ROCM + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); +#else + asm volatile("v_cvt_f32_f16 %0, %1;" : "=v"(f) : "v"(h)); +#endif + return f; +} + +inline __device__ float2 half2_to_float2(uint32_t v) { +#ifndef USE_ROCM + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); + return make_float2(half_to_float(lo), half_to_float(hi)); +#else + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u32 = v; + float2 ret; + ret.x = half_to_float(tmp.u16[0]); + ret.y = half_to_float(tmp.u16[1]); + return ret; +#endif +} + +inline __device__ uint16_t float_to_half(float f) { + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#ifndef USE_ROCM + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f)); +#else + asm volatile("v_cvt_f16_f32 %0, %1;\n" : "=v"(tmp.u32) : "v"(f)); +#endif + return tmp.u16[0]; +} + +inline __device__ uint32_t float2_to_half2(float2 f) { + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#ifndef USE_ROCM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" + : "=r"(tmp.u32) + : "f"(f.y), "f"(f.x)); + #else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); + #endif +#else + tmp.u16[0] = float_to_half(f.x); + tmp.u16[1] = float_to_half(f.y); +#endif + return tmp.u32; +} + +// Vector addition. +inline __device__ uint16_t add(uint16_t a, uint16_t b) { + uint16_t c; +#ifndef USE_ROCM + asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); +#else + asm volatile("v_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif + return c; +} + +inline __device__ uint32_t add(uint32_t a, uint32_t b) { + uint32_t c; +#ifndef USE_ROCM + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); +#else + asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif + return c; +} + +inline __device__ uint2 add(uint2 a, uint2 b) { + uint2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ uint4 add(uint4 a, uint4 b) { + uint4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ float2 add(uint32_t a, float2 fb) { + float2 fa = half2_to_float2(a); + return add(fa, fb); +} + +inline __device__ Float4_ add(uint2 a, Float4_ fb) { + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +inline __device__ Float8_ add(uint4 a, Float8_ fb) { + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} + +// Vector multiplication. 
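// Editor's sketch (not part of this commit): the inline PTX (and GCN asm on
// ROCm) in float_to_half / half_to_float above should agree with the standard
// CUDA conversion intrinsics, since both round to nearest even. Illustrative
// check for finite inputs on the CUDA path; the function name is hypothetical.
#include <cstdint>
#include <cuda_fp16.h>

__device__ bool half_roundtrip_matches_sketch(float f) {
  const uint16_t h = vllm::float_to_half(f);
  return vllm::half_to_float(h) == __half2float(__float2half_rn(f));
}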
+template <> +inline __device__ uint16_t mul(uint16_t a, uint16_t b) { + uint16_t c; +#ifndef USE_ROCM + asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); +#else + asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif + return c; +} + +template <> +inline __device__ uint32_t mul(uint32_t a, uint32_t b) { + uint32_t c; +#ifndef USE_ROCM + asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); +#else + asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif + return c; +} + +template <> +inline __device__ uint32_t mul(uint16_t a, uint32_t b) { + return mul(h0_h0(a), b); +} + +template <> +inline __device__ uint2 mul(uint2 a, uint2 b) { + uint2 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + return c; +} + +template <> +inline __device__ uint2 mul(uint16_t a, uint2 b) { + uint32_t s = h0_h0(a); + uint2 c; + c.x = mul(s, b.x); + c.y = mul(s, b.y); + return c; +} + +template <> +inline __device__ uint4 mul(uint4 a, uint4 b) { + uint4 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + c.z = mul(a.z, b.z); + c.w = mul(a.w, b.w); + return c; +} + +template <> +inline __device__ uint4 mul(uint16_t a, uint4 b) { + uint32_t s = h0_h0(a); + uint4 c; + c.x = mul(s, b.x); + c.y = mul(s, b.y); + c.z = mul(s, b.z); + c.w = mul(s, b.w); + return c; +} + +template <> +inline __device__ float mul(uint16_t a, uint16_t b) { + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb; +} + +template <> +inline __device__ float2 mul(uint32_t a, uint32_t b) { + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return mul(fa, fb); +} + +template <> +inline __device__ float2 mul(uint16_t a, uint32_t b) { + return mul(h0_h0(a), b); +} + +template <> +inline __device__ Float4_ mul(uint2 a, uint2 b) { + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +template <> +inline __device__ Float4_ mul(uint16_t a, uint2 b) { + uint32_t s = h0_h0(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +template <> +inline __device__ Float8_ mul(uint4 a, uint4 b) { + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +template <> +inline __device__ Float8_ mul(uint16_t a, uint4 b) { + uint32_t s = h0_h0(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} + +// Vector fused multiply-add. 
+inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; +#ifndef USE_ROCM + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); +#else + asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" + : "=v"(d) + : "v"(a), "v"(b), "v"(c)); +#endif + return d; +} + +inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) { + return fma(h0_h0(a), b, c); +} + +inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) { + uint2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) { + uint32_t s = h0_h0(a); + uint2 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) { + uint4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) { + uint32_t s = h0_h0(a); + uint4 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +inline __device__ float fma(uint16_t a, uint16_t b, float fc) { + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb + fc; +} + +inline __device__ float2 fma(uint32_t a, uint32_t b, float2 fc) { + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return fma(fa, fb, fc); +} + +inline __device__ float2 fma(uint16_t a, uint32_t b, float2 fc) { + return fma(h0_h0(a), b, fc); +} + +inline __device__ Float4_ fma(uint2 a, uint2 b, Float4_ fc) { + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +inline __device__ Float4_ fma(uint16_t a, uint2 b, Float4_ fc) { + uint32_t s = h0_h0(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +inline __device__ Float8_ fma(uint4 a, uint4 b, Float8_ fc) { + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) { + uint32_t s = h0_h0(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} + +// Vector sum. +template <> +inline __device__ float sum(uint16_t v) { + return half_to_float(v); +} + +template <> +inline __device__ float sum(uint32_t v) { + float2 tmp = half2_to_float2(v); + return tmp.x + tmp.y; +} + +template <> +inline __device__ float sum(uint2 v) { + uint32_t c = add(v.x, v.y); + return sum(c); +} + +template <> +inline __device__ float sum(uint4 v) { + uint32_t c = add(v.x, v.y); + c = add(c, v.z); + c = add(c, v.w); + return sum(c); +} + +// From float32 to float16. +inline __device__ void from_float(uint16_t& dst, float src) { + dst = float_to_half(src); +} + +inline __device__ void from_float(uint32_t& dst, float2 src) { + dst = float2_to_half2(src); +} + +inline __device__ void from_float(uint2& dst, Float4_ src) { + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); +} + +inline __device__ void from_float(uint4& dst, Float8_ src) { + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); + dst.z = float2_to_half2(src.z); + dst.w = float2_to_half2(src.w); +} + +// From float16 to float32. 
+inline __device__ float to_float(uint16_t u) { return half_to_float(u); } + +inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); } + +inline __device__ Float4_ to_float(uint2 u) { + Float4_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + return tmp; +} + +inline __device__ Float8_ to_float(uint4 u) { + Float8_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + tmp.z = half2_to_float2(u.z); + tmp.w = half2_to_float2(u.w); + return tmp; +} + +// Zero-out a variable. +inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); } + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/attention/dtype_float32.cuh b/vllm_v0.10.0/csrc/attention/dtype_float32.cuh new file mode 100644 index 0000000..7c6a686 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/dtype_float32.cuh @@ -0,0 +1,251 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * and + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "attention_generic.cuh" + +#include + +namespace vllm { + +// Define custom FP32 vector data types. +struct Float4_ { + float2 x; + float2 y; +}; + +struct Float8_ { + float2 x; + float2 y; + float2 z; + float2 w; +}; + +// FP32 vector types for Q, K, V. +template <> +struct Vec { + using Type = float; +}; +template <> +struct Vec { + using Type = float2; +}; +template <> +struct Vec { + using Type = float4; +}; + +// FP32 accumulator vector types corresponding to Vec. +template <> +struct FloatVec { + using Type = float; +}; +template <> +struct FloatVec { + using Type = float2; +}; +template <> +struct FloatVec { + using Type = float4; +}; + +// Vector addition. +inline __device__ float add(float a, float b) { return a + b; } + +inline __device__ float2 add(float2 a, float2 b) { + float2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ float4 add(float4 a, float4 b) { + float4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +// Vector multiplication. 
+template <> +inline __device__ float mul(float a, float b) { + return a * b; +} + +template <> +inline __device__ float2 mul(float2 a, float2 b) { + float2 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + return c; +} + +template <> +inline __device__ float2 mul(float a, float2 b) { + float2 c; + c.x = a * b.x; + c.y = a * b.y; + return c; +} + +template <> +inline __device__ float4 mul(float4 a, float4 b) { + float4 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; + return c; +} + +template <> +inline __device__ float4 mul(float a, float4 b) { + float4 c; + c.x = a * b.x; + c.y = a * b.y; + c.z = a * b.z; + c.w = a * b.w; + return c; +} + +// Vector fused multiply-add. +inline __device__ float fma(float a, float b, float c) { return a * b + c; } + +inline __device__ float2 fma(float2 a, float2 b, float2 c) { + float2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ float2 fma(float a, float2 b, float2 c) { + float2 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +inline __device__ float4 fma(float4 a, float4 b, float4 c) { + float4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ float4 fma(float a, float4 b, float4 c) { + float4 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) { + Float4_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { + Float8_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +// Vector sum. +template <> +inline __device__ float sum(float v) { + return v; +} + +template <> +inline __device__ float sum(float2 v) { + return v.x + v.y; +} + +template <> +inline __device__ float sum(float4 v) { + return v.x + v.y + v.z + v.w; +} + +template <> +inline __device__ float sum(Float4_ v) { + return v.x.x + v.x.y + v.y.x + v.y.y; +} + +template <> +inline __device__ float sum(Float8_ v) { + return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; +} + +// Vector dot product. +inline __device__ float dot(float a, float b) { return a * b; } + +inline __device__ float dot(float2 a, float2 b) { + float2 c = mul(a, b); + return c.x + c.y; +} + +inline __device__ float dot(Float4_ a, Float4_ b) { + float2 acc = mul(a.x, b.x); + acc = fma(a.y, b.y, acc); + return acc.x + acc.y; +} + +inline __device__ float dot(Float8_ a, Float8_ b) { + float2 acc = mul(a.x, b.x); + acc = fma(a.y, b.y, acc); + acc = fma(a.z, b.z, acc); + acc = fma(a.w, b.w, acc); + return acc.x + acc.y; +} + +// From float to float. +inline __device__ void from_float(float& dst, float src) { dst = src; } + +inline __device__ void from_float(float2& dst, float2 src) { dst = src; } + +inline __device__ void from_float(float4& dst, float4 src) { dst = src; } + +// From float to float. +inline __device__ float to_float(float u) { return u; } + +inline __device__ float2 to_float(float2 u) { return u; } + +inline __device__ float4 to_float(float4 u) { return u; } + +inline __device__ Float4_ to_float(Float4_ u) { return u; } + +inline __device__ Float8_ to_float(Float8_ u) { return u; } + +// Zero-out a variable. 
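// Editor's sketch (not part of this commit): the generic vllm::zero<T>() in
// attention_generic.cuh clears any POD vector whose size is a multiple of four
// bytes word by word through a union; the float overload below is simply the
// scalar case. The attention kernel relies on this when it zero-initializes
// zero_value and out-of-context value lanes.
__device__ void zero_init_sketch() {
  vllm::Float8_ acc;  // 32 bytes -> eight 32-bit words cleared by the loop
  vllm::zero(acc);
}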
+inline __device__ void zero(float& dst) { dst = 0.f; } + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/attention/dtype_fp8.cuh b/vllm_v0.10.0/csrc/attention/dtype_fp8.cuh new file mode 100644 index 0000000..e714e32 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/dtype_fp8.cuh @@ -0,0 +1,41 @@ +#pragma once + +#include "attention_generic.cuh" + +#include +#ifdef ENABLE_FP8 + #ifndef USE_ROCM + #include + #endif // USE_ROCM +#endif // ENABLE_FP8 + +namespace vllm { + +enum class Fp8KVCacheDataType { + kAuto = 0, + kFp8E4M3 = 1, + kFp8E5M2 = 2, +}; + +// fp8 vector types for quantization of kv cache +template <> +struct Vec { + using Type = uint8_t; +}; + +template <> +struct Vec { + using Type = uint16_t; +}; + +template <> +struct Vec { + using Type = uint32_t; +}; + +template <> +struct Vec { + using Type = uint2; +}; + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/attention/merge_attn_states.cu b/vllm_v0.10.0/csrc/attention/merge_attn_states.cu new file mode 100644 index 0000000..6bee9e4 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/merge_attn_states.cu @@ -0,0 +1,186 @@ +#include +#include +#include +#include +#include + +#include "attention_dtypes.h" +#include "attention_utils.cuh" + +namespace vllm { + +// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 +// can be used to combine partial attention results (in the split-KV case) +template +__global__ void merge_attn_states_kernel( + scalar_t* output, float* output_lse, const scalar_t* prefix_output, + const float* prefix_lse, const scalar_t* suffix_output, + const float* suffix_lse, const uint num_tokens, const uint num_heads, + const uint head_size) { + using pack_128b_t = uint4; + const uint pack_size = 16 / sizeof(scalar_t); + const uint threads_per_head = head_size / pack_size; + + const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x; + const uint token_head_threads = num_tokens * num_heads * threads_per_head; + + if (global_idx >= token_head_threads) return; + + // global_idx -> token_idx + head_idx + pack_idx + const uint token_head_idx = global_idx / threads_per_head; + const uint pack_idx = global_idx % threads_per_head; + + const uint token_idx = token_head_idx / num_heads; + const uint head_idx = token_head_idx % num_heads; + + const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. + const uint head_offset = + token_idx * num_heads * head_size + head_idx * head_size; + const scalar_t* prefix_head_ptr = prefix_output + head_offset; + const scalar_t* suffix_head_ptr = suffix_output + head_offset; + scalar_t* output_head_ptr = output + head_offset; + + float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; + float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; + p_lse = std::isinf(p_lse) ? -std::numeric_limits::infinity() : p_lse; + s_lse = std::isinf(s_lse) ? -std::numeric_limits::infinity() : s_lse; + + const float max_lse = fmaxf(p_lse, s_lse); + p_lse = p_lse - max_lse; + s_lse = s_lse - max_lse; + const float p_se = expf(p_lse); + const float s_se = expf(s_lse); + const float out_se = p_se + s_se; + const float p_scale = p_se / out_se; + const float s_scale = s_se / out_se; + + if (pack_offset < head_size) { + // Pack 128b load + pack_128b_t p_out_pack = reinterpret_cast( + prefix_head_ptr)[pack_offset / pack_size]; + pack_128b_t s_out_pack = reinterpret_cast( + suffix_head_ptr)[pack_offset / pack_size]; + pack_128b_t o_out_pack; + +#pragma unroll + for (uint i = 0; i < pack_size; ++i) { + // Always use float for FMA to keep high precision. 
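      // The p_scale / s_scale factors computed above implement the standard
      // log-sum-exp merge: with m = max(p_lse, s_lse),
      //   p_scale = exp(p_lse - m) / (exp(p_lse - m) + exp(s_lse - m))
      //   s_scale = exp(s_lse - m) / (exp(p_lse - m) + exp(s_lse - m))
      // so the FMA below yields the softmax-consistent combination of the two
      // partial outputs, and the merged LSE written after this loop is
      //   log(out_se) + m = log(exp(p_lse) + exp(s_lse)).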
+ // half(uint16_t), bfloat16, float -> float. + const float p_out_f = + vllm::to_float(reinterpret_cast(&p_out_pack)[i]); + const float s_out_f = + vllm::to_float(reinterpret_cast(&s_out_pack)[i]); + // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale) + const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale); + // float -> half(uint16_t), bfloat16, float. + vllm::from_float(reinterpret_cast(&o_out_pack)[i], o_out_f); + } + + // Pack 128b storage + reinterpret_cast(output_head_ptr)[pack_offset / pack_size] = + o_out_pack; + } + // We only need to write to output_lse once per head. + if (output_lse != nullptr && pack_idx == 0) { + float out_lse = logf(out_se) + max_lse; + output_lse[head_idx * num_tokens + token_idx] = out_lse; + } +} + +} // namespace vllm + +// The following macro is used to dispatch the conversion function based on +// the output data type. The FN is a macro that calls a function with +// template. +#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \ + { \ + if (scalar_dtype == at::ScalarType::Float) { \ + fn(float); \ + } else if (scalar_dtype == at::ScalarType::Half) { \ + fn(uint16_t); \ + } else if (scalar_dtype == at::ScalarType::BFloat16) { \ + fn(__nv_bfloat16); \ + } else { \ + TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \ + } \ + } + +#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \ + { \ + vllm::merge_attn_states_kernel \ + <<>>( \ + reinterpret_cast(output.data_ptr()), output_lse_ptr, \ + reinterpret_cast(prefix_output.data_ptr()), \ + reinterpret_cast(prefix_lse.data_ptr()), \ + reinterpret_cast(suffix_output.data_ptr()), \ + reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ + num_heads, head_size); \ + } + +/*@brief Merges the attention states from prefix and suffix + * into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d + * + * @param output [n,h,d] The output tensor to store the merged attention states. + * @param output_lse [h,d] Optional tensor to store the log-sum-exp values. + * @param prefix_output [n,h,d] The prefix attention states. + * @param prefix_lse [h,n] The log-sum-exp values for the prefix attention + * states. + * @param suffix_output [n,h,d] The suffix attention states. + * @param suffix_lse [h,n] The log-sum-exp values for the suffix attention + * states. + */ +template +void merge_attn_states_launcher(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse) { + constexpr uint NUM_THREADS = 128; + const uint num_tokens = output.size(0); + const uint num_heads = output.size(1); + const uint head_size = output.size(2); + const uint pack_size = 16 / sizeof(scalar_t); + TORCH_CHECK(head_size % pack_size == 0, + "headsize must be multiple of pack_size:", pack_size); + TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1, + "output heads must be contiguous in memory"); + TORCH_CHECK( + prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1, + "prefix_output heads must be contiguous in memory"); + TORCH_CHECK( + suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1, + "suffix_output heads must be contiguous in memory"); + float* output_lse_ptr = nullptr; + if (output_lse.has_value()) { + output_lse_ptr = output_lse.value().data_ptr(); + } + // Process one pack elements per thread. for float, the + // pack_size is 4 for half/bf16, the pack_size is 8. 
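  // Worked example with illustrative sizes: for fp16/bf16, pack_size = 16 / 2 = 8,
  // so head_size = 128 gives threads_per_head = 16 and each thread merges one
  // 128-bit pack; with num_tokens = 256 and num_heads = 32 the launch below needs
  // 256 * 32 * 16 = 131072 threads, i.e. 1024 blocks of NUM_THREADS = 128.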
+ const uint threads_per_head = head_size / pack_size; + const uint total_threads = num_tokens * num_heads * threads_per_head; + + dim3 block(NUM_THREADS); + dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS); + + const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device()); + auto stream = at::cuda::getCurrentCUDAStream(); + + LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS); +} + +#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \ + { \ + merge_attn_states_launcher(output, output_lse, prefix_output, \ + prefix_lse, suffix_output, \ + suffix_lse); \ + } + +void merge_attn_states(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse) { + DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER); +} diff --git a/vllm_v0.10.0/csrc/attention/mla/cutlass_mla_entry.cu b/vllm_v0.10.0/csrc/attention/mla/cutlass_mla_entry.cu new file mode 100644 index 0000000..0319d1d --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/cutlass_mla_entry.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA +void cutlass_mla_decode_sm100a(torch::Tensor const& out, + torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale); +#endif + +void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale) { +#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA + return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache, + seq_lens, page_table, scale); +#endif + TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA"); +} diff --git a/vllm_v0.10.0/csrc/attention/mla/cutlass_mla_kernels.cu b/vllm_v0.10.0/csrc/attention/mla/cutlass_mla_kernels.cu new file mode 100644 index 0000000..9d05d91 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/cutlass_mla_kernels.cu @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.h" + +#include "cutlass_extensions/common.hpp" + +#include "device/sm100_mla.hpp" +#include "kernel/sm100_mla_tile_scheduler.hpp" + +using namespace cute; +using namespace cutlass::fmha::kernel; + +template +struct MlaSm100 { + using Element = T; + using ElementAcc = float; + using ElementOut = T; + + using TileShape = Shape<_128, _128, Shape<_512, _64>>; + using TileShapeH = cute::tuple_element_t<0, TileShape>; + using TileShapeD = cute::tuple_element_t<2, TileShape>; + + // H K (D_latent D_rope) B + using ProblemShape = cute::tuple; + + using StrideQ = cute::tuple; // H D B + using StrideK = cute::tuple; // K D B + using StrideO = StrideK; // H D B + using StrideLSE = cute::tuple<_1, int>; // H B + + using TileScheduler = + std::conditional_t; + + using FmhaKernel = + cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized< + TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler, + /*kIsCpAsync=*/true>; + using Fmha = cutlass::fmha::device::MLA; +}; + +template +typename T::Fmha::Arguments args_from_options( + at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe, + at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens, + at::Tensor const& page_table, double scale) { + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = q_nope.device().index(); + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + + int batches = q_nope.sizes()[0]; + int page_count_per_seq = page_table.sizes()[1]; + int page_count_total = kv_c_and_k_pe_cache.sizes()[0]; + int page_size = kv_c_and_k_pe_cache.sizes()[1]; + int max_seq_len = page_size * page_count_per_seq; + using TileShapeH = typename T::TileShapeH; + using TileShapeD = typename T::TileShapeD; + auto problem_shape = + cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches); + + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + + using StrideQ = typename T::StrideQ; + using StrideK = typename T::StrideK; + using StrideO = typename T::StrideO; + using StrideLSE = typename T::StrideLSE; + + StrideQ stride_Q_latent = cute::make_tuple( + static_cast(D_latent), _1{}, static_cast(H * D_latent)); + StrideQ stride_Q_rope = cute::make_tuple(static_cast(D_rope), _1{}, + static_cast(H * D_rope)); + StrideK stride_C = + cute::make_tuple(static_cast(D_latent + D_rope), _1{}, + static_cast(page_size * (D_latent + D_rope))); + StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq); + StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast(H)); + StrideO stride_O = cute::make_tuple(static_cast(D_latent), _1{}, + static_cast(H * D_latent)); + + using Element = typename T::Element; + using ElementOut = typename T::ElementOut; + using ElementAcc = typename T::ElementAcc; + auto Q_latent_ptr = static_cast(q_nope.data_ptr()); + auto Q_rope_ptr = static_cast(q_pe.data_ptr()); + auto C_ptr = static_cast(kv_c_and_k_pe_cache.data_ptr()); + auto scale_f = static_cast(scale); + typename T::Fmha::Arguments arguments{ + problem_shape, + {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr, + stride_C, C_ptr + D_latent, stride_C, + static_cast(seq_lens.data_ptr()), + static_cast(page_table.data_ptr()), stride_PT, page_count_total, + page_size}, + {static_cast(out.data_ptr()), stride_O, + static_cast(nullptr), stride_LSE}, + hw_info, + 1, // split_kv + nullptr, // 
is_var_split_kv + }; + // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute + // split_kv automatically based on batch size and sequence length to balance + // workload across available SMs. Consider using var_split_kv for manual + // control if needed. + T::Fmha::set_split_kv(arguments); + return arguments; +} + +template +void runMla(at::Tensor const& out, at::Tensor const& q_nope, + at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, + at::Tensor const& seq_lens, at::Tensor const& page_table, + float scale, cudaStream_t stream) { + using MlaSm100Type = MlaSm100; + typename MlaSm100Type::Fmha fmha; + auto arguments = args_from_options( + out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale); + size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + CUTLASS_CHECK(fmha.can_implement(arguments)); + + CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream)); + + CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream)); +} + +void cutlass_mla_decode_sm100a(torch::Tensor const& out, + torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale) { + TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA"); + TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor"); + TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor"); + TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3, + "kv_c_and_k_pe_cache must be a 3D tensor"); + TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor"); + TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor"); + TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor"); + + auto B_q_nope = q_nope.size(0); + auto H_q_nope = q_nope.size(1); + auto D_q_nope = q_nope.size(2); + auto B_q_pe = q_pe.size(0); + auto H_q_pe = q_pe.size(1); + auto D_q_pe = q_pe.size(2); + auto B_pt = page_table.size(0); + auto PAGE_NUM = page_table.size(1); + auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1); + auto D_ckv = kv_c_and_k_pe_cache.size(2); + auto B_o = out.size(0); + auto H_o = out.size(1); + auto D_o = out.size(2); + + TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512"); + TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64"); + TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576"); + TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128, + "H_q_nope, H_q_pe, and H_o must be equal to 128"); + TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0, + "PAGE_SIZE must be a power of 2"); + TORCH_CHECK( + B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o, + "Batch dims must be same for page_table, q_nope and q_pe, and out"); + TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0, + "PAGE_NUM must be divisible by 128 / PAGE_SIZE"); + TORCH_CHECK(D_o == 512, "D_o must be equal to 512"); + + TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half || + q_nope.dtype() == at::ScalarType::BFloat16 || + q_nope.dtype() == at::ScalarType::Float8_e4m3fn, + "q_nope must be a half, bfloat16, or float8_e4m3fn tensor"); + TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() && + q_nope.dtype() == q_pe.dtype(), + "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type"); + TORCH_CHECK(seq_lens.dtype() == torch::kInt32, + "seq_lens must be a 32-bit integer 
tensor"); + TORCH_CHECK(page_table.dtype() == torch::kInt32, + "page_table must be a 32-bit integer tensor"); + + auto in_dtype = q_nope.dtype(); + const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope)); + const cudaStream_t stream = + at::cuda::getCurrentCUDAStream(q_nope.get_device()); + if (in_dtype == at::ScalarType::Half) { + runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, + page_table, scale, stream); + } else if (in_dtype == at::ScalarType::BFloat16) { + runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, + seq_lens, page_table, scale, stream); + } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { + runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, + seq_lens, page_table, scale, stream); + } else { + TORCH_CHECK(false, "Unsupported input data type of MLA"); + } +} diff --git a/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp new file mode 100644 index 0000000..95e3255 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp @@ -0,0 +1,372 @@ +/*************************************************************************************************** + * Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* + * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929 + * by Alcanderian JieXin Liang + */ + +/*! + \file + \brief An universal device layer for cutlass 3.x-style kernels. 
+*/ + +// clang-format off +#pragma once + +// common +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" + +#if !defined(__CUDACC_RTC__) +#include "cutlass/cluster_launch.hpp" +#include "cutlass/trace.h" +#endif // !defined(__CUDACC_RTC__) + +#include "../kernel/sm100_fmha_mla_tma_warpspecialized.hpp" +#include "../kernel/sm100_fmha_mla_reduction.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::fmha::device { + +using namespace cute; +using namespace cutlass::fmha::kernel; + + +//////////////////////////////////////////////////////////////////////////////// +////////////////////////////// CUTLASS 3.x API ///////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template< + class Kernel_ +> +class MLA { +public: + + using Kernel = Kernel_; + + using ReductionKernel = cutlass::fmha::kernel::Sm100FmhaMlaReductionKernel< + typename Kernel::ElementOut, + typename Kernel::ElementAcc, + typename Kernel::ElementAcc, + Kernel::TileShapeH::value, + Kernel::TileShapeL::value, + 256 /*Max split*/ + >; + + /// Argument structure: User API + using KernelArguments = typename Kernel::Arguments; + using ReductionArguments = typename ReductionKernel::Arguments; + + using Arguments = KernelArguments; + + /// Argument structure: Kernel API + using KernelParams = typename Kernel::Params; + using ReductionParams = typename ReductionKernel::Params; + struct Params { + KernelParams fmha_params; + ReductionParams reduction_params; + }; + +private: + + /// Kernel API parameters object + Params params_; + + bool is_initialized(bool set = false) { + static bool initialized = false; + if (set) initialized = true; + return initialized; + } + + static ReductionArguments to_reduction_args(Arguments const& args) { + auto [H, K, D, B] = args.problem_shape; + return ReductionArguments{ + nullptr, args.epilogue.ptr_o, nullptr, args.epilogue.ptr_lse, + args.mainloop.softmax_scale, B, args.split_kv, K, args.mainloop.ptr_seq, + args.ptr_split_kv, Kernel::TileShapeS::value + }; + } + +public: + + /// Access the Params structure + Params const& params() const { + return params_; + } + + static void set_split_kv (KernelArguments& args) { + // printf("set_split_kv start"); + if (args.split_kv >= 1) return; + auto [H, K, D, B] = args.problem_shape; + // std::cout << H << " " << K << " " << D << " " << B << "\n"; + int sm_count = args.hw_info.sm_count; + // printf(" sm_count = %d\n", sm_count); + int max_splits = ceil_div(K, 128); + max_splits = min(16, max_splits); + // printf(" max_splits = %d\n", max_splits); + int sms_per_batch = max(1, sm_count / B); + // printf(" sms_per_batch = %d\n", sms_per_batch); + int split_heur = min(max_splits, sms_per_batch); + int waves = ceil_div(B * split_heur, sm_count); + int k_waves = ceil_div(max_splits, split_heur); + int split_wave_aware = ceil_div(max_splits, k_waves); + args.split_kv = split_wave_aware; + // printf(" args.split_kv = %d\n", args.split_kv); + + } + + /// Determines whether the GEMM can execute the given problem. + static Status + can_implement(Arguments const& args) { + if (! Kernel::can_implement(args)) { + return Status::kInvalid; + } + if (! 
ReductionKernel::can_implement(to_reduction_args(args))) { + return Status::kInvalid; + } + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t + get_workspace_size(Arguments const& args) { + size_t workspace_bytes = 0; + workspace_bytes += Kernel::get_workspace_size(args); + workspace_bytes += ReductionKernel::get_workspace_size(to_reduction_args(args)); + return workspace_bytes; + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int /* smem_capacity */ = -1) { + CUTLASS_TRACE_HOST("MLA::maximum_active_blocks()"); + int max_active_blocks = -1; + int smem_size = Kernel::SharedStorageSize; + + // first, account for dynamic smem capacity if needed + cudaError_t result; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaFuncSetAttribute() returned error: " + << cudaGetErrorString(result)); + return -1; + } + } + + // query occupancy after setting smem size + result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + device_kernel, + Kernel::MaxThreadsPerBlock, + smem_size); + + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: " + << cudaGetErrorString(result)); + return -1; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + + /// Initializes GEMM state from arguments. + Status + initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("MLA::initialize() - workspace " + << workspace << ", stream: " << (stream ? 
"non-null" : "null")); + + // Initialize the workspace + Status status = Kernel::initialize_workspace(args, workspace, stream); + if (status != Status::kSuccess) { + return status; + } + status = ReductionKernel::initialize_workspace(to_reduction_args(args), workspace, stream); + if (status != Status::kSuccess) { + return status; + } + KernelParams kernel_params = Kernel::to_underlying_arguments(args, workspace); + + ReductionArguments reduction_args = to_reduction_args(args); + if (reduction_args.split_kv > 1) { + reduction_args.ptr_oaccum = kernel_params.epilogue.ptr_o_acc; + reduction_args.ptr_lseaccum = kernel_params.epilogue.ptr_lse_acc; + } + ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace); + // Initialize the Params structure + params_ = Params {kernel_params, reduction_params}; + + if (is_initialized()) return Status::kSuccess; + + // account for dynamic smem capacity if needed + // no dynamic smem is needed for reduction kernel + int smem_size = Kernel::SharedStorageSize; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + cudaError_t result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } + + is_initialized(true); + + return Status::kSuccess; + } + + /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params. + Status + update(Arguments const& args, void* workspace = nullptr) { + CUTLASS_TRACE_HOST("MLA()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + if (workspace_bytes > 0 && nullptr == workspace) { + return Status::kErrorWorkspaceNull; + } + + auto fmha_params = Kernel::to_underlying_arguments(args, workspace); + + ReductionArguments reduction_args = to_reduction_args(args); + if (reduction_args.split_kv > 1) { + reduction_args.ptr_oaccum = fmha_params.epilogue.ptr_o_acc; + reduction_args.ptr_lseaccum = fmha_params.epilogue.ptr_lse_acc; + } + ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace); + // Initialize the Params structure + params_ = Params {fmha_params, reduction_params}; + + return Status::kSuccess; + } + + /// Primary run() entry point API that is static allowing users to create and manage their own params. 
+ /// Supplied params struct must be construct by calling Kernel::to_underling_arguments() + static Status + run(Params& params, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("MLA::run()"); + dim3 const block = Kernel::get_block_shape(); + dim3 const grid = Kernel::get_grid_shape(params.fmha_params); + + // configure smem size and carveout + int smem_size = Kernel::SharedStorageSize; + + Status launch_result; + // Use extended launch API only for mainloops that use it + if constexpr(Kernel::ArchTag::kMinComputeCapability >= 90) { + dim3 cluster(cute::size<0>(typename Kernel::ClusterShape{}), + cute::size<1>(typename Kernel::ClusterShape{}), + cute::size<2>(typename Kernel::ClusterShape{})); + void const* kernel = (void const*) device_kernel; + void* kernel_params[] = {¶ms.fmha_params}; + launch_result = ClusterLauncher::launch(grid, cluster, block, smem_size, stream, kernel, kernel_params); + } + else { + launch_result = Status::kSuccess; + device_kernel<<>>(params.fmha_params); + } + + cudaError_t result = cudaGetLastError(); + if (cudaSuccess != result or Status::kSuccess != launch_result) { + //return Status::kSuccess; + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + if (params.reduction_params.split_kv > 1) { + // launch reduction kernel + dim3 const block = ReductionKernel::get_block_shape(); + dim3 const grid = ReductionKernel::get_grid_shape(params.reduction_params); + device_kernel<<>>(params.reduction_params); + cudaError_t result = cudaGetLastError(); + if (cudaSuccess == result) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + } + else { + return Status::kSuccess; + } + } + + // + // Non-static launch overloads that first create and set the internal params struct of this kernel handle. + // + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + Status status = initialize(args, workspace, stream); + if (Status::kSuccess == status) { + status = run(params_, stream); + } + return status; + } + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + return run(args, workspace, stream); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + run(cudaStream_t stream = nullptr) { + return run(params_, stream); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. 
+ Status + operator()(cudaStream_t stream = nullptr) { + return run(params_, stream); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::fmha::device + +//////////////////////////////////////////////////////////////////////////////// diff --git a/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp new file mode 100644 index 0000000..7b6e1dd --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* + * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929 + * by Alcanderian JieXin Liang + */ + +// clang-format off +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/arch.h" +#include "cute/tensor.hpp" + +namespace cutlass::fmha::kernel { + +using namespace cute; +template< + class ElementOut, + class ElementAcc, + class ElementScale, + size_t kNumHeads, + size_t kHeadDimLatent, + int kMaxSplits +> +struct Sm100FmhaMlaReductionKernel { + + static const int SharedStorageSize = 0; + static const int MaxThreadsPerBlock = 128; + static const int MinBlocksPerMultiprocessor = 1; + + using ArchTag = cutlass::arch::Sm100; + + static_assert(kHeadDimLatent % MaxThreadsPerBlock == 0); + struct Arguments { + ElementAcc* ptr_oaccum = nullptr; + ElementOut* ptr_o = nullptr; + ElementAcc* ptr_lseaccum = nullptr; + ElementAcc* ptr_lse = nullptr; + ElementScale scale = 1.f; + int num_batches = 0; + int split_kv = -1; + int dim_k = -1; + int* ptr_seq = nullptr; + int* ptr_split_kv = nullptr; + int tile_shape_s = 128; + }; + using Params = Arguments; + + static Params to_underlying_arguments(Arguments const& args, void* workspace) { + return {args.ptr_oaccum, args.ptr_o, args.ptr_lseaccum, args.ptr_lse, + args.scale, args.num_batches, args.split_kv, args.dim_k, args.ptr_seq, + args.ptr_split_kv, args.tile_shape_s}; + } + + static size_t get_workspace_size(Arguments const& /*args*/) { + return 0; + } + + static Status initialize_workspace( + Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) { + return Status::kSuccess; + } + + static dim3 get_grid_shape(Params const& params) { + return dim3(kNumHeads, 1, params.num_batches); + } + + static dim3 get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + static bool can_implement(Arguments const& args) { + if (args.num_batches <= 0) return false; + if (args.split_kv <= 0) return false; + return true; + } + + CUTLASS_DEVICE void operator() (Params const& params, char* smem_raw) { + if (params.split_kv <= 1) return; + auto blk_coord = make_coord(blockIdx.x, _0{}, blockIdx.z); + + __shared__ ElementAcc sLseScale[kMaxSplits]; + const size_t offset_lseaccum = get<0>(blk_coord) + kNumHeads * params.split_kv * get<2>(blk_coord); + const size_t offset_lse = get<0>(blk_coord) + kNumHeads * get<2>(blk_coord); + + Tensor gLSEaccum = make_tensor(make_gmem_ptr(params.ptr_lseaccum + offset_lseaccum), + make_shape(params.split_kv), Stride>{}); + + Tensor gLSE = make_tensor(make_gmem_ptr(params.ptr_lse + offset_lse), + Shape<_1>{}, Stride<_1>{}); + + auto dim_k = params.ptr_seq == nullptr ? params.dim_k : params.ptr_seq[get<2>(blk_coord)]; + auto local_split_kv = params.ptr_split_kv == nullptr ? params.split_kv : params.ptr_split_kv[get<2>(blk_coord)]; + auto k_tile_total = ceil_div(dim_k, params.tile_shape_s); + auto k_tile_per_cta = ceil_div(k_tile_total, local_split_kv); + local_split_kv = ceil_div(k_tile_total, k_tile_per_cta); + + int warp_idx = cutlass::canonical_warp_idx_sync(); + if (warp_idx == 0) { + constexpr int kNLsePerThread = cute::ceil_div(kMaxSplits, 32); + + ElementAcc local_lse[kNLsePerThread]; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNLsePerThread; ++i) { + const int split = i * 32 + threadIdx.x; + local_lse[i] = split < local_split_kv ? 
gLSEaccum(split) : -std::numeric_limits::infinity(); + } + + ElementAcc lse_max = -std::numeric_limits::infinity(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNLsePerThread; ++i) { + lse_max = max(lse_max, local_lse[i]); + } + CUTLASS_PRAGMA_UNROLL + for (int offset = 16; offset >= 1; offset /= 2) { + lse_max = max(lse_max, __shfl_xor_sync(0xffffffff, lse_max, offset)); + } + lse_max = lse_max == -std::numeric_limits::infinity() ? 0.0f : lse_max; // In case all local LSEs are -inf + lse_max = __shfl_sync(0xffffffff, lse_max, 0); + + ElementAcc sum_lse = 0; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNLsePerThread; ++i) { + sum_lse = sum_lse + expf(local_lse[i] - lse_max); + } + + CUTLASS_PRAGMA_UNROLL + for (int offset = 16; offset >= 1; offset /= 2) { + sum_lse = sum_lse + __shfl_xor_sync(0xffffffff, sum_lse, offset); + } + + sum_lse = __shfl_sync(0xffffffff, sum_lse, 0); + + ElementAcc global_lse = (sum_lse == 0.f || sum_lse != sum_lse) ? std::numeric_limits::infinity() : logf(sum_lse) + lse_max; + if (threadIdx.x == 0 and params.ptr_lse != nullptr) { + gLSE(0) = global_lse; + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNLsePerThread; ++i) { + const int split = i * 32 + threadIdx.x; + if (split < local_split_kv) { + sLseScale[split] = expf(local_lse[i] - global_lse); + } + } + } + __syncthreads(); + + constexpr int Elements = kHeadDimLatent / MaxThreadsPerBlock; + const size_t offset_oaccum = kHeadDimLatent * params.split_kv * (get<0>(blk_coord) + kNumHeads * get<2>(blk_coord)); + Tensor gOaccum = make_tensor(make_gmem_ptr(params.ptr_oaccum + offset_oaccum), + Shape>{}, Stride<_1>{}); + ElementAcc local_val[Elements] = {0}; + for (int split = 0; split < local_split_kv; ++split) { + ElementAcc lse_scale = sLseScale[split]; + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < Elements; ++i) { + local_val[i] += lse_scale * gOaccum(threadIdx.x + MaxThreadsPerBlock * i); + } + gOaccum.data() = gOaccum.data() + kHeadDimLatent; + } + auto ptr_o_local = params.ptr_o + (get<0>(blk_coord) + get<2>(blk_coord) * kNumHeads) * kHeadDimLatent; + Tensor gO = make_tensor(make_gmem_ptr(ptr_o_local), Shape>{}, Stride<_1>{}); + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < Elements; ++i) { + gO(threadIdx.x + MaxThreadsPerBlock * i) = static_cast(local_val[i]); + } + } +}; + +} // namespace cutlass::fmha::kernel diff --git a/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp new file mode 100644 index 0000000..2cbc237 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp @@ -0,0 +1,2023 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* + * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929 + * by Alcanderian JieXin Liang + */ + +// clang-format off +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/arch/simd_sm100.hpp" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/memory_sm80.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "gather_tensor.hpp" // from examples/common +#include "common/pow_2.hpp" + +namespace cutlass::fmha::kernel { + +using namespace cute; + +template< + class TileShape, + class Element_, + class ElementAcc_, + class ElementOut_, + class ElementLSE_, + class TileScheduler, +#ifdef CPASYNC + bool kIsCpAsync = true +#else + bool kIsCpAsync = false +#endif +> +struct Sm100FmhaMlaKernelTmaWarpspecialized { + + using Element = Element_; + using ElementAcc = ElementAcc_; + using ElementOut = ElementOut_; + using ElementLSE = ElementLSE_; + + // only 2Sm mode is supported + static const bool kIs2Sm = true; + static const int MaxThreadsPerBlock = 256; + static const int MinBlocksPerMultiprocessor = 1; + static const int TotalSNum = 2; + static const int TotalPNum = 2; + using ArchTag = cutlass::arch::Sm100; + + using ClusterShape = cute::conditional_t, Shape<_1, _1, _1>>; + + using TileShapeH = tuple_element_t<0, TileShape>; + using TileShapeS = tuple_element_t<1, TileShape>; + using TileShapeD = tuple_element_t<2, TileShape>; + + using TileShapeL = tuple_element_t<0, TileShapeD>; + using TileShapeR = tuple_element_t<1, TileShapeD>; + static_assert(TileShapeL{} % TileShapeR{} == 0, "Rope head dim must divide latent head dim"); + + using ProblemShape = Shape; + using TensorStride = Stride; + using TmemAllocator = cute::conditional_t; + + static_assert(TileShapeH{} == 128); + static const int kWarpsInN = kIs2Sm ? 2 : 1; + + static const int kNumComputeWarps = 4; + static const int kNumLoadWarps = kIsCpAsync ? 2 : 1; + + enum class WarpRole { + kMma = 0x1, kLoad = 0x2, kCompute = 0x3, kLoadPageTable = 0x4, kEmpty=0x0 + }; + + static const long long unsigned int kWarpAssignment = kIsCpAsync ? 
0x4221'3333ull : 0x0021'3333ull; + + static CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) { + return static_cast((kWarpAssignment >> (4 * warp_idx)) & 0xF); + } + + static const int Alignment = 128 / sizeof_bits_v; + static const int AlignmentOut = 128 / sizeof_bits_v; + + using TileShapeQK = Shape; + static const int StagesQK = 24 / sizeof(Element); // free parameter + static const int IterationsQKLatent = decltype(TileShapeL{} / get<2>(TileShapeQK{}))::value; + static const int IterationsQKRope = decltype(TileShapeR{} / get<2>(TileShapeQK{}))::value; + static const int IterationsQK = IterationsQKLatent + IterationsQKRope; + + using Schedule = cute::conditional_t; + using CollectiveMmaQK = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + Element, TensorStride, Alignment, + Element, TensorStride, Alignment, + ElementAcc, + TileShapeQK, ClusterShape, cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using TiledMmaQK = typename CollectiveMmaQK::TiledMma; + using CtaShapeQK = typename CollectiveMmaQK::CtaShape_MNK; + + // chosen for unified smem staging between K and V + using TileShapePV = Shape; + using TransposeTensorStride = decltype(select<1,0,2>(TensorStride{})); + static const int StagesPV = StagesQK; // not sure why, but must be at least two. check pipes + static const int IterationsPV_K = decltype(TileShapeS{} / get<2>(TileShapePV{}))::value; + static const int IterationsPV_N = decltype(TileShapeL{} / get<1>(TileShapePV{}))::value; + + using CollectiveMmaPV = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + Element, TensorStride, Alignment, + Element, TransposeTensorStride, Alignment, + ElementAcc, + TileShapePV, ClusterShape, cutlass::gemm::collective::StageCount, + Schedule>::CollectiveOp; + using CtaShapePV = typename CollectiveMmaPV::CtaShape_MNK; + static_assert(std::is_same_v); + + using TiledMmaPV = typename CollectiveMmaPV::TiledMma; + + using AtomThrShapeMNK = typename CollectiveMmaQK::AtomThrShapeMNK; + static_assert(typename CollectiveMmaQK::AtomThrShapeMNK{} == typename CollectiveMmaPV::AtomThrShapeMNK{}, "schedule must match"); + + static const int StagesPageTable = kIsCpAsync ? 
StagesPV : 1; + + // pipelines from load to mma, PipelineTmaUmmaAsync, stages tbd + // use expect_tx for Q load + using PipelineLoadQK = cute::conditional_t, PipelineTmaUmmaAsync>; + using PipelineLoadPV = PipelineLoadQK; + // pipeline from mma (Q@K) to softmax, PipelineUmmaAsync, 2 stages + using PipelineS = PipelineUmmaAsync; + // pipeline from softmax (P) to mma (bmm2), PipelineUmmaAsync, 2 stages + using PipelineP = PipelineUmmaConsumerAsync; + // pipeline from mma to softmax (for rescale), PipelineUmmaAsync, 1 stage + using PipelineO = PipelineUmmaAsync<1, AtomThrShapeMNK>; + + using PipelinePT = PipelineAsync; + + struct PipelineStorage { + alignas(16) typename PipelineLoadQK::SharedStorage load_qk; + alignas(16) typename PipelineS::SharedStorage mma_s; + alignas(16) typename PipelineP::SharedStorage p_mma; + alignas(16) typename PipelineO::SharedStorage mma_o; + alignas(16) typename PipelinePT::SharedStorage load_page_table; + }; + + template + static CUTE_DEVICE constexpr auto unstageSmemLayout(Layout const& layout, Stages stages = {}) { + return composition(layout, make_tuple(_, _, _, make_layout(stages))); + } + + using SmemLayoutQ = decltype(unstageSmemLayout(typename CollectiveMmaQK::SmemLayoutA{}, Int{})); + using SmemLayoutKC = typename CollectiveMmaQK::SmemLayoutB; + using SmemLayoutVC = typename CollectiveMmaPV::SmemLayoutB; + using SmemLayoutP = decltype(unstageSmemLayout(typename CollectiveMmaPV::SmemLayoutA{}, make_shape(Int{}, _2{}))); + + static const int kBytesLoadQ = size(AtomThrShapeMNK{}) * cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutQ{})) * cute::sizeof_bits_v); + static const int kBytesLoadKC = size(AtomThrShapeMNK{}) * cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutKC{})) * cute::sizeof_bits_v); + static const int kBytesLoadVC = size(AtomThrShapeMNK{}) * cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutVC{})) * cute::sizeof_bits_v); + // pre-condition for overlapped smem staging + static_assert(kBytesLoadKC == kBytesLoadVC); + static_assert(StagesQK == StagesPV); + + static const int kTransactionsBytesLoadQK = kBytesLoadKC; + static const int kTransactionsBytesLoadExtraQ = kBytesLoadQ; + static const int kTransactionsBytesLoadPV = kBytesLoadVC; + + static const int kNamedBarrierExchange = (int) cutlass::arch::ReservedNamedBarriers::TransformBarrier; + // This Named Barrier is introduced to solve Q tile loading overwritten issue when enable persistent + // tile scheduler for FP8 MLA. 
+ static const int kNamedBarrierEpilogue = (int) cutlass::arch::ReservedNamedBarriers::EpilogueBarrier; + // + static const int kNamedBarrierTmemDealloc = (int) cutlass::arch::ReservedNamedBarriers::TmemAllocBarrier; + + enum class TmemAllocation : uint32_t { + kSizeS = TileShapeS::value / kWarpsInN, + // Overall + kSizeO = TileShapeL::value / kWarpsInN, + // Between accumulators we loop over + kSizeAccO = decltype(get<1>(TileShapePV{}))::value / kWarpsInN, + kNumS = TotalSNum, + kNumP = TotalPNum, + kNumO = 1, + kS0 = 0, + kS1 = kS0 + kSizeS, + kO0 = kS1 + kSizeS, + kTotal = kO0 + kSizeO + }; + + static_assert(static_cast(TmemAllocation::kTotal) <= TmemAllocator::Sm100TmemCapacityColumns, "using too much tmem"); + + struct TensorStorage { + // to communicate max and row_sum + cute::array smem_exchange; + cute::array smem_page_table; + alignas(2048) cute::array> smem_q; + union { + alignas(2048) cute::array> smem_kc; + alignas(2048) cute::array> smem_vc; + }; + alignas(2048) cute::array> smem_p; + }; + + struct SharedStorage { + PipelineStorage pipelines; + TensorStorage tensors; + uint32_t tmem_base_ptr; + }; + + static const int SharedStorageSize = sizeof(SharedStorage); + static_assert(SharedStorageSize <= cutlass::arch::sm100_smem_capacity_bytes, "using too much smem"); + + struct MainloopArguments { + ElementAcc softmax_scale; + + // all tensors strides are (num_heads or seqlen, head_dim, batch) + // head_dim stride is always 1 + Element* ptr_q_latent; + TensorStride stride_q_latent; + Element* ptr_q_rope; + TensorStride stride_q_rope; + + Element* ptr_c_latent; + TensorStride stride_c_latent; + Element* ptr_k_rope; + TensorStride stride_k_rope; + + // for paged attention, we interpret what was previously [batch, seqlen] + // as [page_count, page_size], and index according to page_table + int* ptr_seq = nullptr; + int* ptr_page_table = nullptr; + // page table is [batch, seqlen or similar] + Stride<_1, int> stride_page_table = {}; + int page_count = 0; + int page_size = TileShapeS{}; // powers of two if kIsCpAsync, otherwise TileShapeS + }; + + struct EpilogueArguments { + ElementOut* ptr_o = nullptr; + TensorStride stride_o; + ElementLSE* ptr_lse = nullptr; + Stride<_1, int> stride_lse; + ElementAcc output_scale = 1.0f; + }; + + struct Arguments { + // (num_heads=128, seqlen, (d_latent=512, d_rope=64), batch_count) + // for paged attention, seqlen is max seqlen + ProblemShape problem_shape; + MainloopArguments mainloop; + EpilogueArguments epilogue; + KernelHardwareInfo hw_info; + int split_kv = -1; + int* ptr_split_kv = nullptr; + }; + + using TmaLoadQLatent = typename CollectiveMmaQK::Params::TMA_A; + using TmaLoadQRope = typename CollectiveMmaQK::Params::TMA_A; + using TmaLoadCLatent = typename CollectiveMmaQK::Params::TMA_B; + using TmaLoadKRope = typename CollectiveMmaQK::Params::TMA_B; + using TmaLoadCLatentTranspose = typename CollectiveMmaPV::Params::TMA_B; + + struct MainloopParams { + TmaLoadQLatent tma_load_q_latent; + TmaLoadQRope tma_load_q_rope; + TmaLoadCLatent tma_load_c_latent; + TmaLoadKRope tma_load_k_rope; + TmaLoadCLatentTranspose tma_load_c_latent_transpose; + }; + + struct EpilogueParams { + ElementOut* ptr_o = nullptr; + ElementAcc* ptr_o_acc = nullptr; + TensorStride stride_o; + TensorStride stride_o_acc; + ElementLSE* ptr_lse = nullptr; + ElementLSE* ptr_lse_acc = nullptr; + Stride<_1, int> stride_lse; + Stride<_1, int> stride_lse_acc; + ElementAcc output_scale = 1.0f; + }; + + struct Params { + ProblemShape problem_shape; + MainloopArguments mainloop; + 
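    // Unlike EpilogueArguments, EpilogueParams also carries the workspace-backed
    // accumulator pointers/strides (ptr_o_acc, ptr_lse_acc) that
    // to_underlying_arguments() fills in when split_kv > 1.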
EpilogueParams epilogue; + MainloopParams mainloop_params; + typename TileScheduler::Params tile_scheduler; + int split_kv = -1; + int* ptr_split_kv = nullptr; + }; + + static Params to_underlying_arguments(Arguments const& args, void* workspace) { + //workspace = nullptr; // let's get an error if one of these needs workspace + + auto [H, K, D, B] = args.problem_shape; + auto [L, R] = D; + + int paged_B = B; + int paged_K = K; + if (args.mainloop.ptr_page_table != nullptr) { + paged_B = args.mainloop.page_count; + paged_K = args.mainloop.page_size; + } + + auto params_qk_latent = CollectiveMmaQK::to_underlying_arguments( + make_shape(H, K, L, B), + typename CollectiveMmaQK::Arguments { + args.mainloop.ptr_q_latent, args.mainloop.stride_q_latent, + args.mainloop.ptr_c_latent, args.mainloop.stride_c_latent, + }, nullptr); + + auto params_qk_latent_paged = CollectiveMmaQK::to_underlying_arguments( + make_shape(H, paged_K, L, paged_B), + typename CollectiveMmaQK::Arguments { + args.mainloop.ptr_q_latent, args.mainloop.stride_q_latent, + args.mainloop.ptr_c_latent, args.mainloop.stride_c_latent, + }, nullptr); + + auto params_qk_rope = CollectiveMmaQK::to_underlying_arguments( + make_shape(H, K, R, B), + typename CollectiveMmaQK::Arguments { + args.mainloop.ptr_q_rope, args.mainloop.stride_q_rope, + args.mainloop.ptr_k_rope, args.mainloop.stride_k_rope, + }, nullptr); + + auto params_qk_rope_paged = CollectiveMmaQK::to_underlying_arguments( + make_shape(H, paged_K, R, paged_B), + typename CollectiveMmaQK::Arguments { + args.mainloop.ptr_q_rope, args.mainloop.stride_q_rope, + args.mainloop.ptr_k_rope, args.mainloop.stride_k_rope, + }, nullptr); + + + auto stride_c_latent_transpose = select<1,0,2>(args.mainloop.stride_c_latent); + auto params_pv_latent = CollectiveMmaPV::to_underlying_arguments( + make_shape(H, L, paged_K, paged_B), + typename CollectiveMmaPV::Arguments { + args.mainloop.ptr_q_latent, args.mainloop.stride_q_latent, // dummy, never used + args.mainloop.ptr_c_latent, stride_c_latent_transpose, + }, nullptr); + + MainloopParams mainloop_params { + params_qk_latent.tma_load_a, + params_qk_rope.tma_load_a, + params_qk_latent_paged.tma_load_b, + params_qk_rope_paged.tma_load_b, + params_pv_latent.tma_load_b + }; + + EpilogueParams epilogue_params; + + epilogue_params.ptr_o = args.epilogue.ptr_o; + epilogue_params.stride_o = args.epilogue.stride_o; + epilogue_params.ptr_lse = args.epilogue.ptr_lse; + epilogue_params.stride_lse = args.epilogue.stride_lse; + epilogue_params.output_scale = args.epilogue.output_scale; + + if (args.split_kv > 1) { + ElementAcc* ptr_o_acc = reinterpret_cast(workspace); + ElementLSE* ptr_lse_acc = reinterpret_cast(ptr_o_acc + H * L * args.split_kv * B); + epilogue_params.ptr_o_acc = ptr_o_acc; + epilogue_params.ptr_lse_acc = ptr_lse_acc; + + epilogue_params.stride_o_acc = make_tuple(static_cast(0 + L) * args.split_kv, _1{}, static_cast(0 + H * L) * args.split_kv); + epilogue_params.stride_lse_acc = make_tuple(_1{}, (0 + H) * args.split_kv); + } + + return {args.problem_shape, args.mainloop, epilogue_params, mainloop_params, + TileScheduler::to_underlying_arguments(args.problem_shape, args.hw_info, ClusterShape{}, args.split_kv), args.split_kv, args.ptr_split_kv}; + } + + static size_t get_workspace_size(Arguments const& args) { + ProblemShape problem_shape = args.problem_shape; + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + auto split_kv = args.split_kv; + return (sizeof(ElementAcc) * D_latent + sizeof(ElementLSE)) * H * split_kv * B; + 
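    // i.e. per batch and per KV split: one H x D_latent accumulator tile plus one
    // LSE value per head. With illustrative values D_latent = 512, H = 128, B = 8,
    // split_kv = 2 and fp32 accumulators this is (4 * 512 + 4) * 128 * 2 * 8 bytes,
    // roughly 4 MiB.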
} + static Status initialize_workspace( + Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) { + return Status::kSuccess; + } + + static dim3 get_grid_shape(Params const& params) { + return TileScheduler::get_grid_shape(params.tile_scheduler); + } + + static dim3 get_block_shape() { + dim3 block(MaxThreadsPerBlock, 1, 1); + return block; + } + + static bool can_implement(Arguments const& args) { + if (kIsCpAsync) { + if ((args.mainloop.page_size & (args.mainloop.page_size - 1)) != 0) { + return false; + } + if (args.mainloop.page_size > TileShapeS{}) { + return false; + } + } + else { + if (args.mainloop.ptr_page_table != nullptr && args.mainloop.page_size != TileShapeS{}) { + return false; + } + } + if (get<0>(args.problem_shape) != 128) { + return false; + } + if (get<1>(args.problem_shape) <= 0) { + return false; + } + if (args.split_kv <= 0) { + return false; + } + return true; + } + + + CUTLASS_DEVICE void operator()(Params const& params, char* smem_raw) { + + TileScheduler tile_scheduler(params.tile_scheduler); + + int warp_idx = cutlass::canonical_warp_idx_sync(); + auto role = warp_idx_to_role(warp_idx); + uint32_t lane_predicate = cute::elect_one_sync(); + + uint32_t cta_rank_in_cluster = cute::block_rank_in_cluster(); + int cta_coord_v = cta_rank_in_cluster % size<0>(AtomThrShapeMNK{}); + bool is_mma_leader_cta = cta_coord_v == 0; + + if (role == WarpRole::kLoad && lane_predicate && ! kIsCpAsync) { + prefetch_tma_descriptor(params.mainloop_params.tma_load_q_latent.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_c_latent.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_q_rope.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_k_rope.get_tma_descriptor()); + prefetch_tma_descriptor(params.mainloop_params.tma_load_c_latent_transpose.get_tma_descriptor()); + } + SharedStorage& shared_storage = *reinterpret_cast(smem_raw); + + typename PipelineLoadQK::Params pipeline_load_qk_params; + if (role == WarpRole::kLoad) { + pipeline_load_qk_params.role = PipelineLoadQK::ThreadCategory::Producer; + } + if (role == WarpRole::kMma) { + pipeline_load_qk_params.role = PipelineLoadQK::ThreadCategory::Consumer; + } + if constexpr (kIsCpAsync) { + // we can make our life easier by unconditionally loading blocks + // since we know it'll always be legal + pipeline_load_qk_params.producer_arv_count = kNumLoadWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{}); + } + else { + pipeline_load_qk_params.is_leader = lane_predicate && (role == WarpRole::kLoad) && is_mma_leader_cta; + pipeline_load_qk_params.transaction_bytes = kTransactionsBytesLoadQK; + } + pipeline_load_qk_params.initializing_warp = 0; + PipelineLoadQK pipeline_load_qk(shared_storage.pipelines.load_qk, pipeline_load_qk_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineS::Params pipeline_mma_s_params; + if (role == WarpRole::kMma) { + pipeline_mma_s_params.role = PipelineS::ThreadCategory::Producer; + } + if (role == WarpRole::kCompute) { + pipeline_mma_s_params.role = PipelineS::ThreadCategory::Consumer; + } + pipeline_mma_s_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{}); + pipeline_mma_s_params.initializing_warp = 1; + PipelineS pipeline_mma_s( + shared_storage.pipelines.mma_s, + pipeline_mma_s_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename 
PipelineP::Params pipeline_p_mma_params; + if (role == WarpRole::kMma) { + pipeline_p_mma_params.role = PipelineP::ThreadCategory::Consumer; + } + if (role == WarpRole::kCompute) { + pipeline_p_mma_params.role = PipelineP::ThreadCategory::Producer; + } + pipeline_p_mma_params.producer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{}); + pipeline_p_mma_params.consumer_arv_count = 1; + pipeline_p_mma_params.initializing_warp = 2; + PipelineP pipeline_p_mma( + shared_storage.pipelines.p_mma, + pipeline_p_mma_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelineO::Params pipeline_mma_o_params; + if (role == WarpRole::kMma) { + pipeline_mma_o_params.role = PipelineO::ThreadCategory::Producer; + } + if (role == WarpRole::kCompute) { + pipeline_mma_o_params.role = PipelineO::ThreadCategory::Consumer; + } + pipeline_mma_o_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{}); + pipeline_mma_o_params.initializing_warp = 3; + PipelineO pipeline_mma_o( + shared_storage.pipelines.mma_o, + pipeline_mma_o_params, + ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{}); + + typename PipelinePT::Params pipeline_pt_params; + if (role == WarpRole::kLoad) { + pipeline_pt_params.role = PipelinePT::ThreadCategory::Consumer; + } + if (role == WarpRole::kLoadPageTable) { + pipeline_pt_params.role = PipelinePT::ThreadCategory::Producer; + } + pipeline_pt_params.consumer_arv_count = kNumLoadWarps * cutlass::NumThreadsPerWarp; + pipeline_pt_params.producer_arv_count = cutlass::NumThreadsPerWarp; + pipeline_pt_params.initializing_warp = 4; + PipelinePT pipeline_page_table( + shared_storage.pipelines.load_page_table, + pipeline_pt_params); + + TmemAllocator tmem_allocator; + + pipeline_init_arrive_relaxed(size(ClusterShape{})); + + pipeline_load_qk.init_masks(ClusterShape{}); // do we need an update here for 2Sm? 
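+    // By this point all five pipelines are constructed with roles that mirror
+    // the warp specialization: the page-table warp produces into
+    // load_page_table for the load warps, the load warps produce the Q/K and
+    // V stages into load_qk for the MMA warp, and the MMA and compute warps
+    // exchange the S, P and O tiles through mma_s, p_mma and mma_o. The
+    // init_masks calls and start states around here only finish the
+    // cluster-wide barrier initialization before the role dispatch below.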
+ pipeline_mma_s.init_masks(ClusterShape{}); + pipeline_p_mma.init_masks(ClusterShape{}); + pipeline_mma_o.init_masks(ClusterShape{}); + + typename PipelineLoadQK::PipelineState pipeline_load_qk_consumer_state; + typename PipelineLoadQK::PipelineState pipeline_load_qk_producer_state = cutlass::make_producer_start_state(); + + typename PipelineS::PipelineState pipeline_mma_s_consumer_state; + typename PipelineS::PipelineState pipeline_mma_s_producer_state = cutlass::make_producer_start_state(); + + typename PipelineP::PipelineState pipeline_p_mma_consumer_state; + typename PipelineP::PipelineState pipeline_p_mma_producer_state = cutlass::make_producer_start_state(); + + typename PipelineO::PipelineState pipeline_mma_o_consumer_state; + typename PipelineO::PipelineState pipeline_mma_o_producer_state = cutlass::make_producer_start_state(); + + typename PipelinePT::PipelineState pipeline_pt_consumer_state; + typename PipelinePT::PipelineState pipeline_pt_producer_state = cutlass::make_producer_start_state(); + + pipeline_init_wait(size(ClusterShape{})); + + if (role == WarpRole::kLoadPageTable) { + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = tile_scheduler.get_block_coord(); + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; + if (params.mainloop.ptr_seq != nullptr) { + get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } + } + if (local_split_kv <= get<3>(blk_coord)) + continue; + load_page_table( + blk_coord, + problem_shape, + params.mainloop, + shared_storage.tensors, + pipeline_page_table, pipeline_pt_producer_state, + local_split_kv + ); + } + } + else if (role == WarpRole::kLoad) { + if constexpr (kIsCpAsync) { + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = tile_scheduler.get_block_coord(); + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; + if (params.mainloop.ptr_seq != nullptr) { + get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } + } + if (local_split_kv <= get<3>(blk_coord)) + continue; + load_cpasync( + blk_coord, + problem_shape, + params.mainloop, + params.mainloop_params, + shared_storage.tensors, + pipeline_load_qk, pipeline_load_qk_producer_state, + local_split_kv, + /* must be shared pipe */ + pipeline_page_table, pipeline_pt_consumer_state + ); + cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); + } + } + else { + if (params.mainloop.ptr_page_table != nullptr) { + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = tile_scheduler.get_block_coord(); + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; + if (params.mainloop.ptr_seq != nullptr) { + get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } + } + if (local_split_kv <= get<3>(blk_coord)) + continue; + load_tma( + blk_coord, + problem_shape, + params.mainloop, + params.mainloop_params, + shared_storage.tensors, + pipeline_load_qk, pipeline_load_qk_producer_state, + pipeline_load_qk, pipeline_load_qk_producer_state, + local_split_kv + ); 
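+            // Here (and in the non-paged branch below) the same pipeline
+            // object and producer state are passed for both the
+            // PipelineLoadQK and PipelineLoadPV parameters of load_tma, so a
+            // single producer state runs through the K/rope stages and the
+            // transposed-latent (V) stages alike. The epilogue named barrier
+            // that follows appears to keep this producer from racing into the
+            // next tile before the compute warps have drained the current one.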
+ cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); + } + } + else { + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = tile_scheduler.get_block_coord(); + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; + if (params.mainloop.ptr_seq != nullptr) { + get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } + } + if (local_split_kv <= get<3>(blk_coord)) + continue; + load_tma( + blk_coord, + problem_shape, + params.mainloop, + params.mainloop_params, + shared_storage.tensors, + pipeline_load_qk, pipeline_load_qk_producer_state, + pipeline_load_qk, pipeline_load_qk_producer_state, + local_split_kv + ); + cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); + } + } + } + } + else if (role == WarpRole::kMma) { + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + __syncwarp(); + + if (is_mma_leader_cta) { + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = tile_scheduler.get_block_coord(); + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; + if (params.mainloop.ptr_seq != nullptr) { + get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } + } + if (local_split_kv <= get<3>(blk_coord)) + continue; + mma(blk_coord, + problem_shape, + shared_storage.tensors, + pipeline_load_qk, pipeline_load_qk_consumer_state, + pipeline_load_qk, pipeline_load_qk_consumer_state, + pipeline_mma_s, pipeline_mma_s_producer_state, + pipeline_p_mma, pipeline_p_mma_consumer_state, + pipeline_mma_o, pipeline_mma_o_producer_state, + local_split_kv + ); + } + } + + //cutlass::arch::NamedBarrier((kNumComputeWarps + 1) * NumThreadsPerWarp, kNamedBarrierTmemDealloc).arrive_and_wait(); + + //uint32_t free_stage_ptr = shared_storage.tmem_base_ptr; + //tmem_allocator.free(free_stage_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } + else if (role == WarpRole::kCompute) { + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = tile_scheduler.get_block_coord(); + auto problem_shape = params.problem_shape; + auto split_kv = params.split_kv; + auto local_split_kv = split_kv; + if (params.mainloop.ptr_seq != nullptr) { + get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } + } + if (local_split_kv <= get<3>(blk_coord)) + continue; + compute( + blk_coord, + problem_shape, + params.mainloop, // for softmax_scale + params.epilogue, + shared_storage.tensors, // for smem_comm + pipeline_mma_s, pipeline_mma_s_consumer_state, + pipeline_p_mma, pipeline_p_mma_producer_state, + pipeline_mma_o, pipeline_mma_o_consumer_state, + local_split_kv + ); + } + + //cutlass::arch::NamedBarrier((kNumComputeWarps + 1) * NumThreadsPerWarp, kNamedBarrierTmemDealloc).arrive(); + } + + cute::cluster_sync(); + cutlass::arch::NamedBarrier((kNumComputeWarps + 1) * NumThreadsPerWarp, kNamedBarrierTmemDealloc).arrive(); + if (role == WarpRole::kMma) { + uint32_t free_stage_ptr = shared_storage.tmem_base_ptr; + 
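+      // Only the MMA warp allocated tensor memory, so only it returns the
+      // columns; the cluster_sync above ensures the compute warps have
+      // finished reading the S/O accumulators before the free happens.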
tmem_allocator.free(free_stage_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } + } + + template + CUTLASS_DEVICE void load_page_table( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + TensorStorage& shared_tensors, + PipelinePT& pipeline_page_table, + typename PipelinePT::PipelineState& pipeline_pt_producer_state, int const& split_kv) { + + auto [H, K, D, B] = problem_shape; + int batch_coord = get<2>(blk_coord); + + auto mPT_l = make_tensor(make_gmem_ptr(mainloop_args.ptr_page_table), + make_shape(mainloop_args.page_count, B), + mainloop_args.stride_page_table); + auto mPT = mPT_l(_, batch_coord); + + int k_tile_total = ceil_div(K, TileShapeS{}); + int k_tile_per_cta = ceil_div(k_tile_total, split_kv); + int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit + int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index); + if (k_tile_count == 0) { + return; + } + + auto page_size = Pow2{mainloop_args.page_size}; + auto pages_per_tile = Pow2{TileShapeS{} / page_size}; + int thread_idx = threadIdx.x % cutlass::NumThreadsPerWarp; + +#if 1 + for (; k_tile_count > 0; ++k_index, --k_tile_count) { + pipeline_page_table.producer_acquire(pipeline_pt_producer_state); + + // assume a single warp + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < TileShapeS{}; i += cutlass::NumThreadsPerWarp) { + int idx = i + thread_idx; + bool guard = idx < pages_per_tile; + int smem_idx = pipeline_pt_producer_state.index() * TileShapeS::value + idx; + int pt_idx = pages_per_tile * k_index + idx; + + cutlass::arch::cp_async_zfill( + &shared_tensors.smem_page_table[smem_idx], &mPT(pt_idx), guard + ); + } + + pipeline_page_table.producer_commit(pipeline_pt_producer_state, cutlass::arch::cpasync_barrier_arrive); + ++pipeline_pt_producer_state; + } +#endif + } + + + struct Gather { + int& page_table_stage; + Pow2 pages_per_tile; + const int * __restrict__ smem_page_table; + + CUTLASS_DEVICE int operator()(int idx) const { + return smem_page_table[page_table_stage * TileShapeS::value + idx % pages_per_tile]; + } + + CUTLASS_DEVICE friend void print(Gather const&) { + printf(""); + } + + }; + + + template + CUTLASS_DEVICE void load_cpasync( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + MainloopParams const& mainloop_params, + TensorStorage& shared_tensors, + PipelineLoadQK& pipeline_load, + typename PipelineLoadQK::PipelineState& pipeline_load_producer_state, + int const& split_kv, + PipelinePT& pipeline_page_table, + typename PipelinePT::PipelineState& pipeline_pt_consumer_state) { + + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + + using X = Underscore; + + int k_tile_total = ceil_div(K, TileShapeS{}); + int k_tile_per_cta = ceil_div(k_tile_total, split_kv); + int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit + int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index); + if (k_tile_count == 0) { + return; + } + + // partition all tensors + auto mQL = make_tensor(make_gmem_ptr(mainloop_args.ptr_q_latent), make_shape(H, D_latent, B), mainloop_args.stride_q_latent); + auto mQR = make_tensor(make_gmem_ptr(mainloop_args.ptr_q_rope), make_shape(H, D_rope, B), mainloop_args.stride_q_rope); + + int paged_B = mainloop_args.page_count; + auto paged_K = Pow2{mainloop_args.page_size}; + auto mPT_l = make_tensor(make_gmem_ptr(mainloop_args.ptr_page_table), make_shape(paged_B, B), mainloop_args.stride_page_table); + + int 
batch_coord = get<2>(blk_coord); + auto mPT = mPT_l(_, batch_coord); + + auto gQL = local_tile(mQL, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{}); + auto gQR = local_tile(mQR, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{}); + + ThrMMA cta_mma_qk = TiledMmaQK{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{})); + ThrMMA cta_mma_pv = TiledMmaPV{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{})); + + auto tSgQL = cta_mma_qk.partition_A(gQL); + auto tSgQR = cta_mma_qk.partition_A(gQR); + + Tensor sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{}); + Tensor sKC = make_tensor(make_smem_ptr(shared_tensors.smem_kc.begin()), SmemLayoutKC{}); + Tensor sVC = make_tensor(make_smem_ptr(shared_tensors.smem_vc.begin()), SmemLayoutVC{}); + + auto make_copy_for = [](auto sT) { + auto rT_a = sT.layout()(_, _, _, _0{}); + auto rT = make_ordered_layout(shape(rT_a), stride(rT_a)); + auto threads = Int{}; + auto values = Int{}; + return make_cotiled_copy( + Copy_Atom, Element>{}, + make_ordered_layout( + make_shape(threads, values), + make_stride(_1{}, _0{})), + rT); + }; + + // like cute::copy, but makes sure we do all page table lookups first + auto copy_split = [](auto atom, auto src, auto dst) { + auto src_v = group_modes<1, rank_v>(src); + auto dst_v = group_modes<1, rank_v>(dst); + + auto src_v_ptrs = make_tensor(size<1>(src_v)); + for (int i = 0; i < size<1>(src_v); i++) { + src_v_ptrs(i) = &src_v(_0{}, i); + } + + + for (int i = 0; i < size<1>(src_v); i++) { + auto src_v_i = make_tensor( + make_gmem_ptr(src_v_ptrs(i)), + make_shape(shape<0>(src_v)), + make_stride(make_stride(_1{}, _0{})) + ); + atom.call(src_v_i, dst_v(_, i)); + } + }; + + auto tiled_copy_q = make_copy_for(sQ); + auto tiled_copy_kc = make_copy_for(sKC); + auto tiled_copy_vc = make_copy_for(sVC); + + auto thr_copy_q = tiled_copy_q.get_thread_slice(threadIdx.x % (kNumLoadWarps * cutlass::NumThreadsPerWarp)); + auto thr_copy_kc = tiled_copy_kc.get_thread_slice(threadIdx.x % (kNumLoadWarps * cutlass::NumThreadsPerWarp)); + auto thr_copy_vc = tiled_copy_vc.get_thread_slice(threadIdx.x % (kNumLoadWarps * cutlass::NumThreadsPerWarp)); + + auto tQsQ = thr_copy_q.partition_D(sQ); + auto tQgQL = thr_copy_q.partition_S(tSgQL); + auto tQgQR = thr_copy_q.partition_S(tSgQR); + + auto tKCsKC = thr_copy_kc.partition_D(sKC); + auto tVCsVC = thr_copy_vc.partition_D(sVC); + + auto pipeline_pt_release_state = pipeline_pt_consumer_state; + + int page_table_stage = -1; + Pow2 pages_per_tile{TileShapeS{} / paged_K}; + const int * __restrict__ smem_page_table = shared_tensors.smem_page_table.begin(); + Gather gather{page_table_stage, pages_per_tile, smem_page_table}; + + auto mCL = make_tensor( + make_gmem_ptr(mainloop_args.ptr_c_latent), + ComposedLayout{ + make_layout( + make_shape(make_shape(paged_K, paged_B), _1{}), + make_stride(make_stride(get<0>(mainloop_args.stride_c_latent), example::CustomStride(gather, get<2>(mainloop_args.stride_c_latent))), get<1>(mainloop_args.stride_c_latent))), + make_coord(_0{}, _0{}), + make_identity_layout(make_shape(paged_K * paged_B, D_latent))}); + + auto mKR = make_tensor( + make_gmem_ptr(mainloop_args.ptr_k_rope), + ComposedLayout{ + make_layout( + make_shape(make_shape(paged_K, paged_B), _1{}), + make_stride(make_stride(get<0>(mainloop_args.stride_k_rope), example::CustomStride(gather, get<2>(mainloop_args.stride_k_rope))), get<1>(mainloop_args.stride_k_rope))), + make_coord(_0{}, _0{}), + make_identity_layout(make_shape(paged_K * paged_B, D_latent))}); + + auto mCLT = 
make_tensor( + make_gmem_ptr(mainloop_args.ptr_c_latent), + ComposedLayout{ + make_layout( + make_shape(_1{}, make_shape(paged_K, paged_B)), + make_stride(get<1>(mainloop_args.stride_c_latent), make_stride(get<0>(mainloop_args.stride_c_latent), example::CustomStride(gather, get<2>(mainloop_args.stride_c_latent))))), + make_coord(_0{}, _0{}), + make_identity_layout(make_shape(D_latent, paged_K * paged_B))}); + + auto gCL = local_tile(mCL, TileShapeQK{}, make_coord(_,_,_), Step{}); + auto gKR = local_tile(mKR, TileShapeQK{}, make_coord(_,_,_), Step{}); + auto gCLT = local_tile(mCLT, TileShapePV{}, make_coord(_,_,_), Step{}); + + auto tSgCL = cta_mma_qk.partition_B(gCL); + auto tSgKR = cta_mma_qk.partition_B(gKR); + auto tOgCLT = cta_mma_pv.partition_B(gCLT); + + auto tKCgCL = thr_copy_kc.partition_S(tSgCL); + auto tKCgKR = thr_copy_kc.partition_S(tSgKR); + auto tVCgCLT = thr_copy_vc.partition_S(tOgCLT); + + // latent is first in memory, so let's load it first always + // startup: alternate Q and K, set tx count appropriately, for k_idx = 0 + auto& pipeline_acquire_state = pipeline_load_producer_state; + auto pipeline_commit_state = pipeline_acquire_state; + int pipeline_offset = 0; + + for (int i = 0; i < StagesPV; i++) { + cutlass::arch::cp_async_fence(); + } + + auto load_stage = [&](auto fn) { + pipeline_load.producer_acquire(pipeline_acquire_state); + fn(pipeline_acquire_state.index()); + cutlass::arch::cp_async_fence(); + + ++pipeline_acquire_state; + ++pipeline_offset; + + if (pipeline_offset == StagesPV - 1) { + cutlass::arch::cp_async_wait(); + pipeline_load.producer_commit(pipeline_commit_state); + ++pipeline_commit_state; + --pipeline_offset; + } + }; + + pipeline_page_table.consumer_wait(pipeline_pt_consumer_state); + page_table_stage = pipeline_pt_consumer_state.index(); + ++pipeline_pt_consumer_state; + + // each Q/K tile consists of rope and latent + for (int i = 0; i < IterationsQKLatent; i++) { + load_stage([&](int index) { + cute::copy(tiled_copy_q, tQgQL(_, _, _, _, _0{}, i, batch_coord), tQsQ(_, _, _, _, i)); + copy_split(tiled_copy_kc, tKCgCL(_, _, _, _, k_index, i), tKCsKC(_, _, _, _, index)); + }); + } + + for (int i = 0; i < IterationsQKRope; i++) { + load_stage([&](int index) { + cute::copy(tiled_copy_q, tQgQR(_, _, _, _, _0{}, i, batch_coord), tQsQ(_, _, _, _, IterationsQKLatent + i)); + copy_split(tiled_copy_kc, tKCgKR(_, _, _, _, k_index, i), tKCsKC(_, _, _, _, index)); + }); + } + + k_index += 1; + k_tile_count -= 1; + + // assume k_tile_count >= 1 + // perform K+Q load here + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > 0) { + + pipeline_page_table.consumer_wait(pipeline_pt_consumer_state); + page_table_stage = pipeline_pt_consumer_state.index(); + ++pipeline_pt_consumer_state; + + for (int i = 0; i < IterationsQKLatent; i++) { + load_stage([&](int index) { + copy_split(tiled_copy_kc, tKCgCL(_, _, _, _, k_index, i), tKCsKC(_, _, _, _, index)); + }); + } + + for (int i = 0; i < IterationsQKRope; i++) { + load_stage([&](int index) { + copy_split(tiled_copy_kc, tKCgKR(_, _, _, _, k_index, i), tKCsKC(_, _, _, _, index)); + }); + } + + page_table_stage = pipeline_pt_release_state.index(); + + for (int i = 0; i < IterationsPV_K; i++) { + for (int j = 0; j < IterationsPV_N; j++) { + load_stage([&](int index) { + copy_split(tiled_copy_vc, tVCgCLT(_, _, _, _, j, IterationsPV_K * (k_index - 1) + i), tVCsVC(_, _, _, _, index)); + }); + } + } + + pipeline_page_table.consumer_release(pipeline_pt_release_state); + ++pipeline_pt_release_state; + + k_index += 1; + 
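+      // Each iteration of this loop stages the K (latent + rope) tiles for
+      // k_index and then the transposed-latent V tiles for the previous tile,
+      // which is why the tVCgCLT copies above index with (k_index - 1).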
k_tile_count -= 1; + } + + page_table_stage = pipeline_pt_release_state.index(); + + for (int i = 0; i < IterationsPV_K; i++) { + for (int j = 0; j < IterationsPV_N; j++) { + load_stage([&](int index) { + copy_split(tiled_copy_vc, tVCgCLT(_, _, _, _, j, IterationsPV_K * (k_index - 1) + i), tVCsVC(_, _, _, _, index)); + }); + } + } + + pipeline_page_table.consumer_release(pipeline_pt_release_state); + ++pipeline_pt_release_state; + + while (pipeline_offset > 0) { + cutlass::arch::cp_async_fence(); + + cutlass::arch::cp_async_wait(); + pipeline_load.producer_commit(pipeline_commit_state); + ++pipeline_commit_state; + --pipeline_offset; + } + + cutlass::arch::cp_async_wait<0>(); + + } + + + template + CUTLASS_DEVICE void load_tma( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + MainloopParams const& mainloop_params, + TensorStorage& shared_tensors, + PipelineLoadQK& pipeline_load_qk, + typename PipelineLoadQK::PipelineState& pipeline_load_qk_producer_state, + PipelineLoadPV& pipeline_load_pv, + typename PipelineLoadPV::PipelineState& pipeline_load_pv_producer_state, + int const& split_kv) { + + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + + int k_tile_total = ceil_div(K, TileShapeS{}); + int k_tile_per_cta = ceil_div(k_tile_total, split_kv); + int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit + int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index); + if (k_tile_count == 0) { + return; + } + + using X = Underscore; + + // partition all tensors + auto mQL = mainloop_params.tma_load_q_latent.get_tma_tensor(make_shape(H, D_latent, B)); + auto mQR = mainloop_params.tma_load_q_rope.get_tma_tensor(make_shape(H, D_rope, B)); + + int paged_B = B; + int paged_K = K; + if constexpr (kIsPaged) { + paged_B = mainloop_args.page_count; + paged_K = mainloop_args.page_size; + } + auto mPT_l = make_tensor(make_gmem_ptr(mainloop_args.ptr_page_table), make_shape(paged_B, B), mainloop_args.stride_page_table); + + auto mCL = mainloop_params.tma_load_c_latent.get_tma_tensor(make_shape(paged_K, D_latent, paged_B)); + auto mKR = mainloop_params.tma_load_k_rope.get_tma_tensor(make_shape(paged_K, D_rope, paged_B)); + + auto mCLT = mainloop_params.tma_load_c_latent_transpose.get_tma_tensor(make_shape(D_latent, paged_K, paged_B)); + + auto gQL = local_tile(mQL, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{}); + auto gQR = local_tile(mQR, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{}); + + auto gCL = local_tile(mCL, TileShapeQK{}, make_coord(_,_,_), Step{}); + auto gKR = local_tile(mKR, TileShapeQK{}, make_coord(_,_,_), Step{}); + auto gCLT = local_tile(mCLT, TileShapePV{}, make_coord(_,_,_), Step{}); + + ThrMMA cta_mma_qk = TiledMmaQK{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{})); + ThrMMA cta_mma_pv = TiledMmaPV{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{})); + + auto tSgQL = cta_mma_qk.partition_A(gQL); + auto tSgQR = cta_mma_qk.partition_A(gQR); + + auto tSgCL = cta_mma_qk.partition_B(gCL); + auto tSgKR = cta_mma_qk.partition_B(gKR); + + auto tOgCLT = cta_mma_pv.partition_B(gCLT); + + Tensor sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{}); + Tensor sKC = make_tensor(make_smem_ptr(shared_tensors.smem_kc.begin()), SmemLayoutKC{}); + Tensor sVC = make_tensor(make_smem_ptr(shared_tensors.smem_vc.begin()), SmemLayoutVC{}); + + auto [tQLgQL_mkl, tQsQ] = tma_partition( + mainloop_params.tma_load_q_latent, _0{}, make_layout(_1{}), + 
group_modes<0,3>(sQ), group_modes<0,3>(tSgQL)); + + auto [tQRgQR_mkl, tQsQ_ignore] = tma_partition( + mainloop_params.tma_load_q_rope, _0{}, make_layout(_1{}), + group_modes<0,3>(sQ), group_modes<0,3>(tSgQR)); + + auto [tCLgCL_nkl, tKCsKC] = tma_partition( + mainloop_params.tma_load_c_latent, _0{}, make_layout(_1{}), + group_modes<0,3>(sKC), group_modes<0,3>(tSgCL)); + + auto [tKRgKR_nkl, tKCsKC_ignore] = tma_partition( + mainloop_params.tma_load_k_rope, _0{}, make_layout(_1{}), + group_modes<0,3>(sKC), group_modes<0,3>(tSgKR)); + + auto [tCLTgCLT_nkl, tVCsVC] = tma_partition( + mainloop_params.tma_load_c_latent_transpose, _0{}, make_layout(_1{}), + group_modes<0,3>(sVC), group_modes<0,3>(tOgCLT)); + + uint16_t mcast_mask = 0; + + int batch_coord = get<2>(blk_coord); + Tensor tQLgQL = tQLgQL_mkl(_, _, _, batch_coord); + Tensor tQRgQR = tQRgQR_mkl(_, _, _, batch_coord); + + auto mPT = mPT_l(_, batch_coord); + + Tensor tCLgCL = tCLgCL_nkl(_, _, _, _); + Tensor tKRgKR = tKRgKR_nkl(_, _, _, _); + + // careful: stage and k are swapped here! + Tensor tCLTgCLT = tCLTgCLT_nkl(_, _, _, _); + + // latent is first in memory, so let's load it first always + // startup: alternate Q and K, set tx count appropriately, for k_idx = 0 + + // each Q/K tile consists of rope and latent + for (int i = 0; i < IterationsQKLatent; i++) { + pipeline_load_qk.producer_expect_transaction(pipeline_load_qk_producer_state, kTransactionsBytesLoadExtraQ); + pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state); + auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state); + + if (cute::elect_one_sync()) { + // expect the extra bytes + // load_qk ql + cute::copy(mainloop_params.tma_load_q_latent.with(*tma_barrier, mcast_mask), tQLgQL(_, _0{}, i), tQsQ(_, i)); + // load_qk cl + if constexpr (kIsPaged) { + cute::copy( + mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask), + tCLgCL(_, _0{}, i, mPT(k_index)), + tKCsKC(_, pipeline_load_qk_producer_state.index()) + ); + } + else { + cute::copy( + mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask), + tCLgCL(_, k_index, i, batch_coord), + tKCsKC(_, pipeline_load_qk_producer_state.index())); + } + } + ++pipeline_load_qk_producer_state; + } + + for (int i = 0; i < IterationsQKRope; i++) { + pipeline_load_qk.producer_expect_transaction(pipeline_load_qk_producer_state, kTransactionsBytesLoadExtraQ); + pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state); + auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state); + + if (cute::elect_one_sync()) { + // expect the extra bytes + // load_qk ql + cute::copy(mainloop_params.tma_load_q_rope.with(*tma_barrier, mcast_mask), tQRgQR(_, _0{}, i), tQsQ(_, i + IterationsQKLatent)); + // load_qk cl + if constexpr (kIsPaged) { + cute::copy( + mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask), + tKRgKR(_, _0{}, i, mPT(k_index)), + tKCsKC(_, pipeline_load_qk_producer_state.index()) + ); + } + else { + cute::copy( + mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask), + tKRgKR(_, k_index, i, batch_coord), + tKCsKC(_, pipeline_load_qk_producer_state.index())); + } + } + ++pipeline_load_qk_producer_state; + } + + k_index += 1; + k_tile_count -= 1; + + // assume k_tile_count >= 1 + // perform K+Q load here + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > 0) { + + // perform K load + for (int i = 0; i < IterationsQKLatent; i++) { + pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state); + auto tma_barrier 
= pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state); + + if (cute::elect_one_sync()) { + // load_qk cl + if constexpr (kIsPaged) { + cute::copy( + mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask), + tCLgCL(_, _0{}, i, mPT(k_index)), + tKCsKC(_, pipeline_load_qk_producer_state.index()) + ); + } + else { + cute::copy( + mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask), + tCLgCL(_, k_index, i, batch_coord), + tKCsKC(_, pipeline_load_qk_producer_state.index())); + } + } + ++pipeline_load_qk_producer_state; + } + + for (int i = 0; i < IterationsQKRope; i++) { + pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state); + auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state); + + if (cute::elect_one_sync()) { + // load_qk cl + if constexpr (kIsPaged) { + cute::copy( + mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask), + tKRgKR(_, _0{}, i, mPT(k_index)), + tKCsKC(_, pipeline_load_qk_producer_state.index()) + ); + } + else { + cute::copy( + mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask), + tKRgKR(_, k_index, i, batch_coord), + tKCsKC(_, pipeline_load_qk_producer_state.index())); + } + } + ++pipeline_load_qk_producer_state; + } + + // prefetch next K load to keep busy while we transpose-load from cache + const int kPrefetchDistance = 1; + for (int i = 0; i < IterationsQKLatent; i++) { + if (cute::elect_one_sync()) { + if constexpr (kIsPaged) { + if (k_tile_count > kPrefetchDistance) { + cute::prefetch( + mainloop_params.tma_load_c_latent, + tCLgCL(_, _0{}, i, mPT(k_index + kPrefetchDistance)) + ); + } + } + else { + cute::prefetch( + mainloop_params.tma_load_c_latent, + tCLgCL(_, k_index + kPrefetchDistance, i, batch_coord) + ); + } + } + } + + for (int i = 0; i < IterationsQKRope; i++) { + if (cute::elect_one_sync()) { + if constexpr (kIsPaged) { + if (k_tile_count > kPrefetchDistance) { + cute::prefetch( + mainloop_params.tma_load_k_rope, + tKRgKR(_, _0{}, i, mPT(k_index + kPrefetchDistance)) + ); + } + } + else { + cute::prefetch( + mainloop_params.tma_load_k_rope, + tKRgKR(_, k_index + kPrefetchDistance, i, batch_coord) + ); + } + } + } + + // perform V load (k_idx - 1) + + for (int i = 0; i < IterationsPV_K; i++) { + for (int j = 0; j < IterationsPV_N; j++) { + pipeline_load_pv.producer_acquire(pipeline_load_pv_producer_state); + auto tma_barrier = pipeline_load_pv.producer_get_barrier(pipeline_load_pv_producer_state); + + if (cute::elect_one_sync()) { + // load_pv cl + // note the transpose in indices! 
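+          // There is no separate V cache: the same latent pages are re-read
+          // through tma_load_c_latent_transpose so the PV MMA consumes
+          // C_latent^T, and the EVICT_FIRST hint marks this second, streaming
+          // pass as the first candidate for eviction.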
+ // note we are off-by-one on k_index + if constexpr (kIsPaged) { + cute::copy( + mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST), + tCLTgCLT(_, j, i, mPT(k_index - 1)), + tVCsVC(_, pipeline_load_pv_producer_state.index()) + ); + } + else { + cute::copy( + mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST), + tCLTgCLT(_, j, IterationsPV_K * (k_index - 1) + i, batch_coord), + tVCsVC(_, pipeline_load_pv_producer_state.index()) + ); + } + } + ++pipeline_load_pv_producer_state; + } + } + + k_index += 1; + k_tile_count -= 1; + } + + for (int i = 0; i < IterationsPV_K; i++) { + for (int j = 0; j < IterationsPV_N; j++) { + pipeline_load_pv.producer_acquire(pipeline_load_pv_producer_state); + auto tma_barrier = pipeline_load_pv.producer_get_barrier(pipeline_load_pv_producer_state); + + if (cute::elect_one_sync()) { + // load_pv cl + // note the transpose in indices + // note we are off-by-one on k_index + + if constexpr (kIsPaged) { + cute::copy( + mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST), + tCLTgCLT(_, j, i, mPT(k_index - 1)), + tVCsVC(_, pipeline_load_pv_producer_state.index()) + ); + } + else { + cute::copy( + mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST), + tCLTgCLT(_, j, IterationsPV_K * (k_index - 1) + i, batch_coord), + tVCsVC(_, pipeline_load_pv_producer_state.index()) + ); + } + } + ++pipeline_load_pv_producer_state; + } + } + } + + template + CUTLASS_DEVICE void mma( + BlkCoord const& blk_coord, + ProblemShape const& problem_shape, + TensorStorage& shared_tensors, + PipelineLoadQK& pipeline_load_qk, + typename PipelineLoadQK::PipelineState& pipeline_load_qk_consumer_state, + PipelineLoadPV& pipeline_load_pv, + typename PipelineLoadPV::PipelineState& pipeline_load_pv_consumer_state, + PipelineS& pipeline_mma_s, + typename PipelineS::PipelineState& pipeline_mma_s_producer_state, + PipelineP& pipeline_p_mma, + typename PipelineP::PipelineState& pipeline_p_mma_consumer_state, + PipelineO& pipeline_mma_o, + typename PipelineO::PipelineState& pipeline_mma_o_producer_state, + int const& split_kv) { + + auto [H, K, D, B] = problem_shape; + + int k_tile_total = ceil_div(K, TileShapeS{}); + int k_tile_per_cta = ceil_div(k_tile_total, split_kv); + int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit + int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index); + if (k_tile_count == 0) { + return; + } + + // mma init + Tensor sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{}); + Tensor sKC = make_tensor(make_smem_ptr(shared_tensors.smem_kc.begin()), SmemLayoutKC{}); + Tensor sVC = make_tensor(make_smem_ptr(shared_tensors.smem_vc.begin()), SmemLayoutVC{}); + Tensor sP = make_tensor(make_smem_ptr((Element*) shared_tensors.smem_p.begin()), SmemLayoutP{}); + + Tensor tSrQ = TiledMmaQK::make_fragment_A(sQ); + Tensor tSrKC = TiledMmaQK::make_fragment_B(sKC); + Tensor tOrP = TiledMmaPV::make_fragment_A(sP); + Tensor tOrVC = TiledMmaPV::make_fragment_B(sVC); + + TiledMmaQK tiled_mma_qk; + TiledMmaPV tiled_mma_pv; + + Tensor tStS = partition_fragment_C(tiled_mma_qk, select<0,1>(TileShapeQK{})); + Tensor tItI = partition_fragment_C(tiled_mma_pv, select<0,1>(TileShapePV{})); + + tiled_mma_pv.accumulate_ = UMMA::ScaleOut::Zero; + + 
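+    // The S accumulator ping-pongs between two tensor-memory buffers
+    // (TmemAllocation::kS0 / kS1), selected below from the mma_s producer
+    // stage index, so the MMA warp can start Q*K^T for the next tile while
+    // the compute warps are still running softmax on the previous one.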
pipeline_mma_s.producer_acquire(pipeline_mma_s_producer_state); + + // Mma S0 S1 O0 S2 O1 ... Sn On-1 On + // S0 ownership -- ----- -- -- + // S1 ownership -- ----- ---- + // O ownership -- -- ---- -- + + tiled_mma_qk.accumulate_ = UMMA::ScaleOut::Zero; + for (int i = 0; i < IterationsQK; i++) { + pipeline_load_qk.consumer_wait(pipeline_load_qk_consumer_state); + int read_stage = pipeline_load_qk_consumer_state.index(); + + tStS.data() = uint32_t(pipeline_mma_s_producer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1); + + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tSrQ); ++k_block) { + cute::gemm(tiled_mma_qk, + tSrQ(_,_,k_block,i), + tSrKC(_,_,k_block,read_stage), + tStS); + tiled_mma_qk.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_load_qk.consumer_release(pipeline_load_qk_consumer_state); + ++pipeline_load_qk_consumer_state; + } + + pipeline_mma_s.producer_commit(pipeline_mma_s_producer_state); + ++pipeline_mma_s_producer_state; + + k_tile_count -= 1; + + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > 0) { + + pipeline_mma_s.producer_acquire(pipeline_mma_s_producer_state); + tiled_mma_qk.accumulate_ = UMMA::ScaleOut::Zero; + for (int i = 0; i < IterationsQK; i++) { + pipeline_load_qk.consumer_wait(pipeline_load_qk_consumer_state); + int read_stage = pipeline_load_qk_consumer_state.index(); + + tStS.data() = uint32_t(pipeline_mma_s_producer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1); + + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tSrQ); ++k_block) { + cute::gemm(tiled_mma_qk, + tSrQ(_,_,k_block,i), + tSrKC(_,_,k_block,read_stage), + tStS); + tiled_mma_qk.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_load_qk.consumer_release(pipeline_load_qk_consumer_state); + ++pipeline_load_qk_consumer_state; + } + + pipeline_mma_s.producer_commit(pipeline_mma_s_producer_state); + ++pipeline_mma_s_producer_state; + + pipeline_mma_o.producer_acquire(pipeline_mma_o_producer_state); + pipeline_p_mma.consumer_wait(pipeline_p_mma_consumer_state); + + for (int i = 0; i < IterationsPV_K; i++) { + auto acc_flag = tiled_mma_pv.accumulate_; + for (int j = 0; j < IterationsPV_N; j++) { + pipeline_load_pv.consumer_wait(pipeline_load_pv_consumer_state); + + int read_stage = pipeline_load_pv_consumer_state.index(); + + tItI.data() = uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO); + tiled_mma_pv.accumulate_ = acc_flag; + + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tOrP); ++k_block) { + cute::gemm(tiled_mma_pv, + tOrP(_,_,k_block, make_coord(i, pipeline_p_mma_consumer_state.index())), + tOrVC(_,_,k_block,read_stage), + tItI); + tiled_mma_pv.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_load_pv.consumer_release(pipeline_load_pv_consumer_state); + ++pipeline_load_pv_consumer_state; + } + } + + pipeline_p_mma.consumer_release(pipeline_p_mma_consumer_state); + ++pipeline_p_mma_consumer_state; + pipeline_mma_o.producer_commit(pipeline_mma_o_producer_state); + ++pipeline_mma_o_producer_state; + + --k_tile_count; + } + + pipeline_mma_o.producer_acquire(pipeline_mma_o_producer_state); + pipeline_p_mma.consumer_wait(pipeline_p_mma_consumer_state); + + for (int i = 0; i < IterationsPV_K; i++) { + auto acc_flag = tiled_mma_pv.accumulate_; + for (int j = 0; j < IterationsPV_N; j++) { + pipeline_load_pv.consumer_wait(pipeline_load_pv_consumer_state); + + int read_stage = pipeline_load_pv_consumer_state.index(); + + tItI.data() = uint32_t(TmemAllocation::kO0) + j * 
uint32_t(TmemAllocation::kSizeAccO); + tiled_mma_pv.accumulate_ = acc_flag; + + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tOrP); ++k_block) { + cute::gemm(tiled_mma_pv, + tOrP(_,_,k_block, make_coord(i, pipeline_p_mma_consumer_state.index())), + tOrVC(_,_,k_block,read_stage), + tItI); + tiled_mma_pv.accumulate_ = UMMA::ScaleOut::One; + } + + pipeline_load_pv.consumer_release(pipeline_load_pv_consumer_state); + ++pipeline_load_pv_consumer_state; + } + } + + pipeline_p_mma.consumer_release(pipeline_p_mma_consumer_state); + ++pipeline_p_mma_consumer_state; + pipeline_mma_o.producer_commit(pipeline_mma_o_producer_state); + ++pipeline_mma_o_producer_state; + } + + + template + CUTLASS_DEVICE void softmax( + IsLastTile const& is_last_tile, + ElementAcc& row_max, + ElementAcc& row_sum, + ElementAcc& correction_factor, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + TensorStorage& shared_tensors, + int k_index, + uint32_t tmem_s, + int smem_p_index) { + + auto load_op = cute::SM100_TMEM_LOAD_32dp32b32x{}; + + TiledMmaQK tiled_mma_qk; + + Tensor tStS = partition_fragment_C(tiled_mma_qk, select<0,1>(TileShapeQK{})); + tStS.data() = tmem_s; + + CUTE_STATIC_ASSERT_V(shape<1>(tStS) == _1{}); + CUTE_STATIC_ASSERT_V(shape<2>(tStS) == _1{}); + Tensor tAcc = tStS(make_coord(_,_),_0{},_0{}); + + Tensor cS = make_identity_tensor(take<0,2>(CtaShapeQK{})); + + auto tiled_t2r = make_tmem_copy(load_op, tAcc); + auto thread_idx = threadIdx.x % size(tiled_t2r); + + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_cS = thread_t2r.partition_D(cS); + Tensor tTR_rAcc = make_tensor(shape(tTR_cS)); + + Tensor tTR_rS_frag = make_tensor(shape(tTR_rAcc)); + const int AlignmentS = 4; + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc); + Tensor tTR_rAcc_vec = recast>(tTR_rAcc); + Tensor tTR_rS_vec = recast>(tTR_rS_frag); + + // load s + copy(tiled_t2r, tTR_tAcc, tTR_rAcc); + + if (is_last_tile) { + for (int i = 0; i < size(tTR_rAcc); i++) { + if (get<1>(tTR_cS(i)) + TileShapeS{} * k_index >= get<1>(problem_shape)) { + tTR_rAcc(i) = -std::numeric_limits::infinity(); + } + } + } + + // max + ElementAcc row_max_new = row_max; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i += 1) { + row_max_new = ::fmax(row_max_new, tTR_rAcc(i)); + } + + // for 2x2 dp, reduce here + if constexpr (kWarpsInN > 1) { + shared_tensors.smem_exchange[threadIdx.x] = row_max_new; + cutlass::arch::NamedBarrier(kNumComputeWarps*NumThreadsPerWarp, kNamedBarrierExchange).sync(); + // (64, 2) shape + int peer_index = (threadIdx.x + 64) % 128; + row_max_new = cutlass::max(row_max_new, shared_tensors.smem_exchange[peer_index]); + } + +#ifndef B2B + // find correction factor + ElementAcc softmax_scale_log2 = mainloop_args.softmax_scale * static_cast(M_LOG2E); + correction_factor = ::exp2f(softmax_scale_log2 * (row_max - row_max_new)); + row_max = row_max_new; + + // softmax + ElementAcc row_max_scale_log2 = row_max * softmax_scale_log2; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rAcc(i) = ::exp2f(softmax_scale_log2 * tTR_rAcc(i) - row_max_scale_log2); + } +#endif + + // quantize + cutlass::NumericArrayConverter epilogue_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc_vec); i++) { + tTR_rS_vec(i) = epilogue_op(tTR_rAcc_vec(i)); + } + + Tensor sP = make_tensor(make_smem_ptr((Element*) shared_tensors.smem_p.begin()), SmemLayoutP{})(_, _, _, make_coord(_, smem_p_index)); + + Tensor tOcP = TiledMmaPV{}.get_slice(_0{}).partition_A(cS); 
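+    // The exponentiated tile, converted to the P operand type above, has to
+    // land in this stage of smem_p with the per-thread layout TiledMmaPV
+    // expects for operand A; the ordered layout built next (over the 128
+    // compute threads arranged as 64x2) encodes that mapping.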
+ + // have a mapping for each thread to coord + // find identical mapping to coords for the MMA + auto l = make_ordered_layout(make_shape(make_shape(_64{}, _2{}), make_shape(_16{}, TileShapeS{} / _32{})), make_stride(make_stride(_0{}, _3{}), make_stride(_1{}, _2{}))); + auto sP_ = as_position_independent_swizzle_tensor(sP); + copy_aligned(tTR_rS_frag, sP_.compose(l)(threadIdx.x, _)); + + // sum + row_sum *= correction_factor; + + static_assert(cute::is_same_v); + auto tTR_rAcc_float2 = recast(tTR_rAcc); + auto sums = make_tensor(_4{}); + static_assert(size(tTR_rAcc_float2) % size(sums) == 0); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(sums); i++) { + sums(i) = tTR_rAcc_float2(i); + } + CUTLASS_PRAGMA_UNROLL + for (int i = size(sums); i < size(tTR_rAcc_float2); i += size(sums)) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size(sums); j++) { + cute::add(sums(j), sums(j), tTR_rAcc_float2(i + j)); + } + } + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < size(sums); i *= 2) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size(sums); j += 2*i) { + cute::add(sums(j), sums(j), sums(j+i)); + } + } + row_sum += sums(0).x + sums(0).y; + } + + + CUTLASS_DEVICE void rescale( + ElementAcc correction_factor, + uint32_t tmem_o) { + + // for b2b gemm, do nothing +#ifndef B2B + auto load_op = cute::SM100_TMEM_LOAD_32dp32b32x{}; + auto store_op = TMEM::tmem_load_to_store(load_op); + + TiledMmaPV tiled_mma_pv; + + Tensor tItI = partition_fragment_C(tiled_mma_pv, select<0,1>(TileShapePV{})); + tItI.data() = tmem_o; + + CUTE_STATIC_ASSERT_V(shape<1>(tItI) == _1{}); + CUTE_STATIC_ASSERT_V(shape<2>(tItI) == _1{}); + Tensor tAcc = tItI(make_coord(_,_),_0{},_0{}); + + auto cta_tiler_pv = take<0,2>(typename CollectiveMmaPV::CtaShape_MNK{}); + Tensor gO = make_tensor(make_gmem_ptr((ElementAcc*) nullptr), cta_tiler_pv, make_stride(0, 0)); + + auto tiled_t2r = make_tmem_copy(load_op, tAcc); + auto tiled_r2t = make_tmem_copy(store_op, tAcc); + auto thread_idx = threadIdx.x % size(tiled_t2r); + + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + auto thread_r2t = tiled_r2t.get_slice(thread_idx); + Tensor tTR_gO = thread_t2r.partition_D(gO); + Tensor tTR_rAcc = make_tensor(shape(tTR_gO)); + + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc); + + // load o + copy(tiled_t2r, tTR_tAcc, tTR_rAcc); + + // multiply by correction factor + float2 correction_factor_vec = make_float2(correction_factor, correction_factor); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i += 2) { + float2 in = make_float2(tTR_rAcc(i + 0), tTR_rAcc(i + 1)); + float2 out; + cute::mul(out, in, correction_factor_vec); + tTR_rAcc(i + 0) = out.x; + tTR_rAcc(i + 1) = out.y; + } + + // store o + copy(tiled_r2t, tTR_rAcc, tTR_tAcc); +#endif + } + + + template + CUTLASS_DEVICE void epilogue( + ElementAcc& row_max, + ElementAcc& row_sum, + BlkCoord const& cta_coord, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + EpilogueParams const& epilogue_args, + TensorStorage& shared_tensors, + uint32_t tmem_o, + int const& split_kv) { + + auto load_op = cute::SM100_TMEM_LOAD_32dp32b32x{}; + + TiledMmaPV tiled_mma_pv; + + Tensor tItI = TiledMmaPV::make_fragment_C(partition_shape_C(TiledMmaPV{}, take<0, 2>(TileShapePV{}))); + tItI.data() = tmem_o; + + CUTE_STATIC_ASSERT_V(shape<1>(tItI) == _1{}); + CUTE_STATIC_ASSERT_V(shape<2>(tItI) == _1{}); + Tensor tAcc = tItI(make_coord(_,_),_0{},_0{}); + + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + if (epilogue_args.ptr_o_acc != nullptr) { + using 
ElementOutAcc = ElementAcc; + constexpr auto AlignmentOutAcc = 128 / cute::sizeof_bits_v; + Tensor mO = make_tensor(make_gmem_ptr(epilogue_args.ptr_o_acc + get<3>(cta_coord) * D_latent), make_shape(H, D_latent, B), epilogue_args.stride_o_acc); + auto cta_tiler_pv = take<0,2>(typename CollectiveMmaPV::CtaShape_MNK{}); + Tensor gO = local_tile(mO, cta_tiler_pv, take<0,3>(cta_coord)); + + auto tiled_t2r = make_tmem_copy(load_op, tAcc); + auto thread_idx = threadIdx.x % size(tiled_t2r); + + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_gO = thread_t2r.partition_D(gO); + Tensor tTR_rAcc = make_tensor(shape(tTR_gO)); + + Tensor tTR_rO_frag = make_tensor(shape(tTR_rAcc)); + Tensor tTR_rO_src = recast>(coalesce(tTR_rO_frag)); + Tensor tR2G_rO_dst = recast>(coalesce(tTR_gO)); + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc); + + copy(tiled_t2r, tTR_tAcc, tTR_rAcc); + + cutlass::epilogue::thread::LinearCombination epilogue_op({epilogue_args.output_scale / row_sum}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rO_frag(i) = epilogue_op(tTR_rAcc(i)); + } + + copy(tTR_rO_src, tR2G_rO_dst); + +#ifndef B2B + + // compute LSE + ElementAcc lse = cutlass::fast_log(row_sum) + mainloop_args.softmax_scale * row_max; + + // store LSE + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_args.ptr_lse_acc + H * get<3>(cta_coord)), make_shape(H, B), epilogue_args.stride_lse_acc); + Tensor gLSE = local_tile(mLSE, append<3>(cta_tiler_pv, _1{}), take<0,3>(cta_coord), Step<_1, Underscore, _1>{}); + // for 2x2 dp, this must be conditional and the index is wrong + if (! kIs2Sm || (threadIdx.x < 64)) + { + gLSE(threadIdx.x) = lse; + } + #endif + } + else { + Tensor mO = make_tensor(make_gmem_ptr(epilogue_args.ptr_o), make_shape(H, D_latent, B), epilogue_args.stride_o); + auto cta_tiler_pv = take<0,2>(typename CollectiveMmaPV::CtaShape_MNK{}); + Tensor gO = local_tile(mO, cta_tiler_pv, take<0,3>(cta_coord)); + + auto tiled_t2r = make_tmem_copy(load_op, tAcc); + auto thread_idx = threadIdx.x % size(tiled_t2r); + + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_gO = thread_t2r.partition_D(gO); + Tensor tTR_rAcc = make_tensor(shape(tTR_gO)); + + Tensor tTR_rO_frag = make_tensor(shape(tTR_rAcc)); + Tensor tTR_rO_src = recast>(coalesce(tTR_rO_frag)); + Tensor tR2G_rO_dst = recast>(coalesce(tTR_gO)); + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc); + + copy(tiled_t2r, tTR_tAcc, tTR_rAcc); + + cutlass::epilogue::thread::LinearCombination epilogue_op({epilogue_args.output_scale / row_sum}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rO_frag(i) = epilogue_op(tTR_rAcc(i)); + } + + copy(tTR_rO_src, tR2G_rO_dst); + +#ifndef B2B + if (epilogue_args.ptr_lse != nullptr) { + // compute LSE + ElementAcc lse = cutlass::fast_log(row_sum) + mainloop_args.softmax_scale * row_max; + + // store LSE + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_args.ptr_lse), make_shape(H, B), epilogue_args.stride_lse); + Tensor gLSE = local_tile(mLSE, append<3>(cta_tiler_pv, _1{}), take<0,3>(cta_coord), Step<_1, Underscore, _1>{}); + + // for 2x2 dp, this must be conditional and the index is wrong + if (! 
kIs2Sm || (threadIdx.x < 64)) + { + gLSE(threadIdx.x) = lse; + } + } +#endif + } + } + + + template + CUTLASS_DEVICE void compute( + CtaCoord const& cta_coord, + ProblemShape const& problem_shape, + MainloopArguments const& mainloop_args, + EpilogueParams const& epilogue_args, + TensorStorage& shared_tensors, + PipelineS& pipeline_mma_s, + typename PipelineS::PipelineState& pipeline_mma_s_consumer_state, + PipelineP& pipeline_p_mma, + typename PipelineP::PipelineState& pipeline_p_mma_producer_state, + PipelineO& pipeline_mma_o, + typename PipelineO::PipelineState& pipeline_mma_o_consumer_state, + int const& split_kv) { + + auto [H, K, D, B] = problem_shape; + + int k_tile_total = ceil_div(K, TileShapeS{}); + int k_tile_per_cta = ceil_div(k_tile_total, split_kv); + int k_index = get<3>(cta_coord) * k_tile_per_cta; // lower limit + int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index); + if (k_tile_count == 0) { + + // if we return early, we have to make sure we release the load warp + cutlass::arch::NamedBarrier( + (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, + kNamedBarrierEpilogue + ).arrive(); + + return; + } + int k_index_final = k_tile_total - 1; + + ElementAcc row_max = -std::numeric_limits::infinity(); + ElementAcc row_sum = 0; + ElementAcc correction_factor = 1; + + pipeline_p_mma.producer_acquire(pipeline_p_mma_producer_state); + pipeline_mma_s.consumer_wait(pipeline_mma_s_consumer_state); + + auto dispatch_bool = [](bool b, auto fn) { + if (b) { + fn(cute::true_type{}); + } + else { + fn(cute::false_type{}); + } + }; + + // softmax s0 -> p0 + dispatch_bool(k_index == k_index_final, [&](auto is_last_tile) { + softmax( + is_last_tile, + row_max, row_sum, correction_factor, + problem_shape, mainloop_args, shared_tensors, k_index, + uint32_t(pipeline_mma_s_consumer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1), + pipeline_p_mma_producer_state.index() + ); + }); + + k_index += 1; + + cutlass::arch::fence_view_async_tmem_load(); + cutlass::arch::fence_view_async_shared(); + pipeline_mma_s.consumer_release(pipeline_mma_s_consumer_state); + ++pipeline_mma_s_consumer_state; + pipeline_p_mma.producer_commit(pipeline_p_mma_producer_state); + ++pipeline_p_mma_producer_state; + + k_tile_count -= 1; + + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > 0) { + pipeline_p_mma.producer_acquire(pipeline_p_mma_producer_state); + pipeline_mma_s.consumer_wait(pipeline_mma_s_consumer_state); + + // softmax s1 -> p1 + dispatch_bool(k_index == k_index_final, [&](auto is_last_tile) { + softmax( + is_last_tile, + row_max, row_sum, correction_factor, + problem_shape, mainloop_args, shared_tensors, k_index, + uint32_t(pipeline_mma_s_consumer_state.index() == 0 ? 
TmemAllocation::kS0 : TmemAllocation::kS1), + pipeline_p_mma_producer_state.index() + ); + }); + + cutlass::arch::fence_view_async_tmem_load(); + cutlass::arch::fence_view_async_shared(); + pipeline_mma_s.consumer_release(pipeline_mma_s_consumer_state); + ++pipeline_mma_s_consumer_state; + pipeline_p_mma.producer_commit(pipeline_p_mma_producer_state); + ++pipeline_p_mma_producer_state; + + pipeline_mma_o.consumer_wait(pipeline_mma_o_consumer_state); + + // rescale + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < IterationsPV_N; j++) { + rescale(correction_factor, uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO)); + } + + cutlass::arch::fence_view_async_tmem_store(); + pipeline_mma_o.consumer_release(pipeline_mma_o_consumer_state); + ++pipeline_mma_o_consumer_state; + + --k_tile_count; + k_index += 1; + } + + pipeline_mma_o.consumer_wait(pipeline_mma_o_consumer_state); + +#ifdef B2B + row_sum = 1; +#else + if constexpr (kWarpsInN > 1) { + // reduce row_sum if needed (for 2x2 dp) + shared_tensors.smem_exchange[threadIdx.x] = row_sum; + cutlass::arch::NamedBarrier(kNumComputeWarps*NumThreadsPerWarp, kNamedBarrierExchange).sync(); + // (64, 2) shape + int peer_index = (threadIdx.x + 64) % 128; + row_sum += shared_tensors.smem_exchange[peer_index]; + } +#endif + + cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive(); + + // epilogue + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < IterationsPV_N; j++) { + epilogue( + row_max, row_sum, + replace<1>(cta_coord, j), problem_shape, + mainloop_args, epilogue_args, shared_tensors, + uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO), split_kv + ); + } + + cutlass::arch::fence_view_async_tmem_load(); + pipeline_mma_o.consumer_release(pipeline_mma_o_consumer_state); + ++pipeline_mma_o_consumer_state; + } + +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::fmha::kernel diff --git a/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp new file mode 100644 index 0000000..c990ee2 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp @@ -0,0 +1,165 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* + * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929 + * by Alcanderian JieXin Liang + */ + +// clang-format off +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.h" + +namespace cutlass::fmha::kernel { + +//////////////////////////////////////////////////////////////////////////////// + +struct Sm100MlaIndividualTileScheduler { + + struct Params { + dim3 grid; + }; + + bool valid_ = true; + + CUTLASS_DEVICE + Sm100MlaIndividualTileScheduler(Params const&) {} + + template + static Params to_underlying_arguments( + ProblemShape const& problem_shape, KernelHardwareInfo hw_info, + ClusterShape const& cluster_shape, int const& split_kv) { + using namespace cute; + dim3 grid(get<0>(cluster_shape), get<3>(problem_shape) /* Batch */, split_kv /*Maximum Split KV*/); + return Params{ grid }; + } + + static dim3 get_grid_shape(Params const& params) { + return params.grid; + } + + CUTLASS_DEVICE + bool is_valid() { + return valid_; + } + + CUTLASS_DEVICE + auto get_block_coord() { + using namespace cute; + return make_coord(blockIdx.x, _0{}, blockIdx.y, blockIdx.z); + } + + CUTLASS_DEVICE + Sm100MlaIndividualTileScheduler& operator++() { + valid_ = false; + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct Sm100MlaPersistentTileScheduler { + + struct Params { + int num_blocks; + FastDivmod divmod_m_block; + FastDivmod divmod_b; + FastDivmod divmod_split_kv; + KernelHardwareInfo hw_info; + }; + + int block_idx = 0; + Params params; + + CUTLASS_DEVICE + Sm100MlaPersistentTileScheduler(Params const& params) : block_idx(blockIdx.x), params(params) {} + + template + static Params to_underlying_arguments( + ProblemShape const& problem_shape, KernelHardwareInfo hw_info, + ClusterShape const& cluster_shape, int const& split_kv) { + using namespace cute; + // Get SM count if needed, otherwise use user supplied SM count + int sm_count = hw_info.sm_count; + if (sm_count <= 1 || sm_count % size<0>(cluster_shape) != 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + sm_count = KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + } + + CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count); + hw_info.sm_count = sm_count; + + int num_m_blocks = size<0>(cluster_shape); + int num_blocks = num_m_blocks * get<3>(problem_shape) /* Batch */; + num_blocks *= split_kv; /* Maximum Split 
KV*/ + + return Params { + num_blocks, + { num_m_blocks}, { get<3>(problem_shape) }, {split_kv}, + hw_info + }; + } + + static dim3 get_grid_shape(Params const& params) { + dim3 grid(std::min(params.num_blocks, params.hw_info.sm_count), 1, 1); + return grid; + } + + CUTLASS_DEVICE + bool is_valid() { + return block_idx < params.num_blocks; + } + + CUTLASS_DEVICE + auto get_block_coord() { + using namespace cute; + int block_decode = block_idx; + int m_block, bidb, n_split_kv; + params.divmod_m_block(block_decode, m_block, block_decode); + params.divmod_b(block_decode, bidb, block_decode); + params.divmod_split_kv(block_decode, n_split_kv, block_decode); + return make_coord(m_block, _0{}, bidb, n_split_kv); + } + + CUTLASS_DEVICE + Sm100MlaPersistentTileScheduler& operator++() { + block_idx += gridDim.x; + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::fmha::kernel diff --git a/vllm_v0.10.0/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/vllm_v0.10.0/csrc/attention/mla/sm100_cutlass_mla_kernel.cu new file mode 100644 index 0000000..e0e95d0 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -0,0 +1,283 @@ +/* +Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +Copyright 2025 SGLang Team. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +/* + * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929 + * by Alcanderian JieXin Liang + */ +#include "core/registration.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include "cutlass_sm100_mla/device/sm100_mla.hpp" +#include "cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp" + +// clang-format off +#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040 +void sm100_cutlass_mla_decode( + torch::Tensor const& out, + torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, + torch::Tensor const& workspace, + int64_t num_kv_splits) { + TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode"); +} +int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) { + TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_get_workspace_size"); +} +#else + +#define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \ + } + +using namespace cute; +using namespace cutlass::fmha::kernel; + +template +struct IsPersistent { + static const bool value = v; +}; + +template > +struct MlaSm100 { + using Element = T; + using ElementAcc = float; + using ElementOut = T; + + using TileShape = Shape<_128, _128, Shape<_512, _64>>; + using TileShapeH = cute::tuple_element_t<0, TileShape>; + using TileShapeD = cute::tuple_element_t<2, TileShape>; + + // H K (D_latent D_rope) B + using ProblemShape = cute::tuple; + + using StrideQ = cute::tuple; // H D B + using StrideK = cute::tuple; // K D B + using StrideO = StrideK; // H D B + using StrideLSE = cute::tuple<_1, int>; // H B + + using TileScheduler = + std::conditional_t; + + using FmhaKernel = cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized< + TileShape, + Element, + ElementAcc, + ElementOut, + ElementAcc, + TileScheduler, + /*kIsCpAsync=*/!IsPaged128>; + using Fmha = cutlass::fmha::device::MLA; +}; + +template +typename T::Fmha::Arguments args_from_options( + at::Tensor const& out, + at::Tensor const& q_nope, + at::Tensor const& q_pe, + at::Tensor const& kv_c_and_k_pe_cache, + at::Tensor const& seq_lens, + at::Tensor const& page_table, + double sm_scale, + int64_t num_kv_splits) { + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = q_nope.device().index(); + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + int batches = q_nope.sizes()[0]; + int page_count_per_seq = page_table.sizes()[1]; + int page_count_total = kv_c_and_k_pe_cache.sizes()[0]; + int page_size = kv_c_and_k_pe_cache.sizes()[1]; + int max_seq_len = page_size * page_count_per_seq; + using TileShapeH = typename T::TileShapeH; + using TileShapeD = typename T::TileShapeD; + auto problem_shape = cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches); + + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + + float scale = float(sm_scale); + + using StrideQ = typename T::StrideQ; + using StrideK = typename T::StrideK; + using StrideO = typename T::StrideO; + using StrideLSE = typename T::StrideLSE; + + StrideQ stride_Q_nope = cute::make_tuple( + static_cast(q_nope.stride(1)), _1{}, static_cast(q_nope.stride(0))); + StrideQ stride_Q_pe = cute::make_tuple( + 
static_cast(q_pe.stride(1)), _1{}, static_cast(q_pe.stride(0))); + + StrideK stride_C = cute::make_tuple( + static_cast(0 + D_latent + D_rope), _1{}, static_cast(page_size * (D_latent + D_rope))); + StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq); + StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H); + StrideO stride_O = cute::make_tuple(static_cast(0 + D_latent), _1{}, static_cast(0 + H * D_latent)); + + using Element = typename T::Element; + using ElementOut = typename T::ElementOut; + using ElementAcc = typename T::ElementAcc; + auto Q_nope_ptr = static_cast(q_nope.data_ptr()); + auto Q_pe_ptr = static_cast(q_pe.data_ptr()); + auto C_ptr = static_cast(kv_c_and_k_pe_cache.data_ptr()); + typename T::Fmha::Arguments arguments{ + problem_shape, + {scale, + Q_nope_ptr, + stride_Q_nope, + Q_pe_ptr, + stride_Q_pe, + C_ptr, + stride_C, + C_ptr + D_latent, + stride_C, + static_cast(seq_lens.data_ptr()), + static_cast(page_table.data_ptr()), + stride_PT, + page_count_total, + page_size}, + {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, + hw_info, + // TODO(trevor-m): Change split_kv back to -1 when + // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will + // perform worse with larger context length and smaller batch sizes. + num_kv_splits, // split_kv + nullptr, // is_var_split_kv + }; + // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute + // split_kv automatically based on batch size and sequence length to balance + // workload across available SMs. Consider using var_split_kv for manual + // control if needed. + T::Fmha::set_split_kv(arguments); + return arguments; +} + +template +void runMla( + at::Tensor const& out, + at::Tensor const& q_nope, + at::Tensor const& q_pe, + at::Tensor const& kv_c_and_k_pe_cache, + at::Tensor const& seq_lens, + at::Tensor const& page_table, + at::Tensor const& workspace, + double sm_scale, + int64_t num_kv_splits, + cudaStream_t stream) { + using MlaSm100Type = MlaSm100; + typename MlaSm100Type::Fmha fmha; + auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); + + CUTLASS_CHECK(fmha.can_implement(arguments)); + + CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream)); + + CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream)); +} + +#define DISPATCH_BOOL(expr, const_expr, ...) \ + [&]() -> bool { \ + if (expr) { \ + constexpr bool const_expr = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool const_expr = false; \ + return __VA_ARGS__(); \ + } \ + }() + +void sm100_cutlass_mla_decode( + torch::Tensor const& out, + torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, + torch::Tensor const& workspace, + double sm_scale, + int64_t num_kv_splits) { + auto in_dtype = q_nope.dtype(); + at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()}; + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(q_nope.get_device()); + const int page_size = kv_c_and_k_pe_cache.sizes()[1]; + + // NOTE(alcanderian): IsPersistent has bug with manual split_kv. + // Kernel will hang if batch is too large with large num_kv_splits. (for example bs=8, num_kv_splits=8) + // Maybe per batch split kv will fix this. 
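+  // The nested DISPATCH_BOOL calls below lift the runtime checks
+  // (page_size == 128, num_kv_splits <= 1) into compile-time booleans so that
+  // runMla can instantiate the matching MlaSm100 specialization; the innermost
+  // branch then selects the element type (fp16 / bf16 / fp8-e4m3). As defined
+  // above, DISPATCH_BOOL(expr, const_expr, ...) expands to an immediately
+  // invoked lambda that declares `constexpr bool const_expr = true;` (or
+  // false) and runs the body with that constant in scope.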
+ DISPATCH_BOOL(page_size == 128, IsPaged128, [&] { + DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] { + if (in_dtype == at::ScalarType::Half) { + runMla>( + out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + } else if (in_dtype == at::ScalarType::BFloat16) { + runMla>( + out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { + runMla>( + out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + } else { + TORCH_CHECK(false, "Unsupported input data type of MLA"); + } + return true; + }); + return true; + }); +} + +int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) { + // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc) + // which are float, so Element type here doesn't matter. + using MlaSm100Type = MlaSm100; + + // Get split kv. Requires problem shape and sm_count only. + typename MlaSm100Type::Fmha::Arguments arguments; + using TileShapeH = typename MlaSm100Type::TileShapeH; + using TileShapeD = typename MlaSm100Type::TileShapeD; + arguments.problem_shape = + cute::make_tuple(TileShapeH{}, static_cast(max_seq_len), TileShapeD{}, static_cast(num_batches)); + // Assumes device 0 when getting sm_count. + arguments.hw_info.sm_count = + sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count; + arguments.split_kv = num_kv_splits; + MlaSm100Type::Fmha::set_split_kv(arguments); + + return MlaSm100Type::Fmha::get_workspace_size(arguments); +} + +#endif + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode); +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) { + m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size); +} + +// clang-format on diff --git a/vllm_v0.10.0/csrc/attention/paged_attention_v1.cu b/vllm_v0.10.0/csrc/attention/paged_attention_v1.cu new file mode 100644 index 0000000..7a5ef10 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/paged_attention_v1.cu @@ -0,0 +1,187 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" +#include "cuda_compat.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ + VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ + ((void*)vllm::paged_attention_v1_kernel), \ + shared_mem_size); \ + vllm::paged_attention_v1_kernel \ + <<>>( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ + scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ + k_scale_ptr, v_scale_ptr, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +// TODO(woosuk): Tune NUM_THREADS. +template +void paged_attention_v1_launcher( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int padded_max_seq_len = + DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; + int logits_size = padded_max_seq_len * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len + // Keep that in sync with the logic here! + int shared_mem_size = std::max(logits_size, outputs_size); + + dim3 grid(num_heads, num_seqs, 1); + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. 
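+    // Each case below expands LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE), which
+    // raises the kernel's dynamic shared-memory limit to shared_mem_size and
+    // launches paged_attention_v1_kernel instantiated for that head size on
+    // the current stream.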
+ case 32: + LAUNCH_PAGED_ATTENTION_V1(32); + break; + case 64: + LAUNCH_PAGED_ATTENTION_V1(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V1(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V1(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V1(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V1(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V1(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V1(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v1_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); + +#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + if (is_block_sparse) { \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + } else { \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v1( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE) +} + +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP diff --git a/vllm_v0.10.0/csrc/attention/paged_attention_v2.cu b/vllm_v0.10.0/csrc/attention/paged_attention_v2.cu new file mode 100644 index 0000000..b45b28d --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/paged_attention_v2.cu @@ -0,0 +1,197 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" +#include "cuda_compat.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ + vllm::paged_attention_v2_kernel \ + <<>>( \ + exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ + value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, k_scale_ptr, v_scale_ptr, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); \ + vllm::paged_attention_v2_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + max_num_partitions); + +template +void paged_attention_v2_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); + int logits_size = PARTITION_SIZE * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + + // For paged attention v2 kernel. + dim3 grid(num_heads, num_seqs, max_num_partitions); + int shared_mem_size = std::max(logits_size, outputs_size); + // For paged attention v2 reduce kernel. 
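+  // The main kernel (grid z = max_num_partitions) writes per-partition max
+  // logits, exp sums, and partial outputs into max_logits / exp_sums /
+  // tmp_out; the reduce kernel launched below then combines those partials
+  // into the final output and needs two floats of shared memory per
+  // partition.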
+ dim3 reduce_grid(num_heads, num_seqs); + int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; + case 64: + LAUNCH_PAGED_ATTENTION_V2(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V2(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V2(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V2(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V2(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V2(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v2_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + if (is_block_sparse) { \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + } else { \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. 
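+// The macro below switches on the runtime KV-cache block size (only 8, 16,
+// and 32 are compiled) and forwards to CALL_V2_LAUNCHER_SPARSITY, which picks
+// the block-sparse or dense instantiation based on is_block_sparse.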
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v2( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& + tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V2_LAUNCHER_BLOCK_SIZE) +} + +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP diff --git a/vllm_v0.10.0/csrc/attention/vertical_slash_index.cu b/vllm_v0.10.0/csrc/attention/vertical_slash_index.cu new file mode 100644 index 0000000..c1b45b1 --- /dev/null +++ b/vllm_v0.10.0/csrc/attention/vertical_slash_index.cu @@ -0,0 +1,401 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
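+// This file builds the per-row sparse-attention index used by MInference-style
+// vertical-slash attention (Algorithm 4 of https://arxiv.org/abs/2407.02490):
+// vertical indices are kept as individual column entries, while slash
+// (diagonal) indices are merged into block ranges, producing the block_count /
+// block_offset / column_count / column_index tensors per (batch, head, row).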
+ +#include + +#include + +#include + +__device__ int64_t save_blocks(int* block_offset, int64_t range_start, + int64_t range_end, int64_t block_size, + int64_t input_block_count, int64_t kv_seqlen) { + if (range_start >= kv_seqlen) { + return input_block_count; + } + if (range_end > kv_seqlen) { + range_end = kv_seqlen; + } + int64_t current_block_count = input_block_count; + for (int idx = range_start; idx < range_end; idx += block_size) { + block_offset[current_block_count++] = idx; + } + return current_block_count; +} + +__global__ void convert_vertical_slash_indexes_kernel( + const int* q_seqlens, // [BATCH, ] + const int* kv_seqlens, // [BATCH, ] + const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S] + int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S] + int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V] + int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, + int64_t NNZ_V, int64_t NNZ_S, + bool causal // True for intra, False for succ +) { + const int batch_idx = blockIdx.y; + const int head_idx = blockIdx.x; + const int group_idx = blockIdx.z; + + int64_t q_seqlen = q_seqlens[batch_idx]; + int64_t kv_seqlen = kv_seqlens[batch_idx]; + int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x; + int64_t start_m = block_idx_m * BLOCK_SIZE_M; + if (start_m >= q_seqlen) { + return; + } + int64_t end_m = start_m + BLOCK_SIZE_M; + vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V; + slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S; + int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m; + block_count += row_offset; + block_offset += row_offset * NNZ_S; + column_count += row_offset; + column_index += row_offset * NNZ_V; + + bool has_slash = true; + int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0; + int64_t s = 0, v = 0; + int64_t v_idx = vertical_indexes[v++]; + int64_t s_idx = slash_indexes[s++]; + if (causal) { + while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) { + s_idx = slash_indexes[s++]; + } + if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false; + s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M); + } else { + while (s_idx >= end_m + kv_seqlen && s < NNZ_S) { + s_idx = slash_indexes[s++]; + } + if (s_idx > end_m + kv_seqlen) has_slash = false; + s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M); + } + + int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx; + if (!has_slash) { + if (causal) { + range_start = (kv_seqlen - q_seqlen) + end_m; + range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N; + } else { + range_start = kv_seqlen; + range_end = kv_seqlen + BLOCK_SIZE_N; + } + } + + bool slash_finished = false; + while (1) { + if (v_idx < range_end) { + if (v_idx < range_start) { + column_index[tmp_col_cnt++] = v_idx; + } + if (v < NNZ_V) { + v_idx = vertical_indexes[v++]; + } else { + if (causal) + v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen); + else + v_idx = end_m + BLOCK_SIZE_N + kv_seqlen; + } + } else { + if ((s < NNZ_S && causal) || + (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) { + if (causal) + s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++], + BLOCK_SIZE_M); + else + s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M); + } else { + if (v == NNZ_V || (v_idx > range_start && 
causal)) { + // add the last vertical if no more slash + if (v == NNZ_V && !causal && v_idx < kv_seqlen) { + column_index[tmp_col_cnt++] = v_idx; + } + tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, + BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen); + break; + } else { + if (causal) { + range_start = (kv_seqlen - q_seqlen) + end_m; + range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N; + } else { + // if slash_finished but there are vertical left, save current + // blocks + tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, + BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen); + range_start = kv_seqlen; + range_end = kv_seqlen + BLOCK_SIZE_N; + } + slash_finished = true; + } + } + if (!slash_finished) { + if (s_idx > range_end + BLOCK_SIZE_M) { + tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, + BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen); + range_start = s_idx - BLOCK_SIZE_M; + range_end = s_idx; + } else if (s_idx > range_end) { + range_end += BLOCK_SIZE_M; + } + } + } + } + + block_count[0] = tmp_blk_cnt; + column_count[0] = tmp_col_cnt; +} + +void convert_vertical_slash_indexes_64x64( + const int* q_seqlens, // [BATCH, ] + const int* kv_seqlens, // [BATCH, ] + const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S] + int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S] + int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V] + int64_t BATCH_SIZE, int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, + int64_t BLOCK_SIZE_N, int64_t NNZ_V, int64_t NNZ_S, bool causal) { + const int N_THREADS = 64; + const dim3 dimBlock(N_THREADS); + const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS); + convert_vertical_slash_indexes_kernel<<>>( + q_seqlens, kv_seqlens, vertical_indexes, slash_indexes, block_count, + block_offset, column_count, column_index, N_HEADS, N_ROWS, BLOCK_SIZE_M, + BLOCK_SIZE_N, NNZ_V, NNZ_S, causal); +} + +/** + * Implements the Algorithm 4 in paper https://arxiv.org/abs/2407.02490. + * + * This function builds the index of each row of blocks from vertical indices + * and slash indices. The vertical indices are treated as points, while the + * slash indices are converted as ranges. The output consists of the merged + * ranges and separate column indices, where the ranges are represented by + * block indices. + * + * The implementation is referenced from the original MInference repo: + * https://github.com/microsoft/MInference/blob/main/csrc/vertical_slash_index.cu. 
+ */ +void convert_vertical_slash_indexes( + torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] + torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V] + torch::Tensor q_seqlens, // [BATCH, ] + torch::Tensor kv_seqlens, // [BATCH, ] + torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S] + int64_t context_size, int64_t block_size_M, int64_t block_size_N, + bool causal) { + cudaSetDevice(q_seqlens.get_device()); + + int batch_size = slash_indexes.size(0); + int num_heads = slash_indexes.size(1); + int nnz_slash = slash_indexes.size(2); + int nnz_vertical = vertical_indexes.size(2); + int num_rows = (context_size + block_size_M - 1) / block_size_M; + + convert_vertical_slash_indexes_64x64( + q_seqlens.data_ptr(), kv_seqlens.data_ptr(), + vertical_indexes.data_ptr(), slash_indexes.data_ptr(), + block_count.data_ptr(), block_offset.data_ptr(), + column_count.data_ptr(), column_index.data_ptr(), batch_size, + num_heads, num_rows, block_size_M, block_size_N, nnz_vertical, nnz_slash, + causal); +} + +__global__ void convert_vertical_slash_indexes_kernel_mergehead( + const int* q_seqlens, // [BATCH, ] + const int* kv_seqlens, // [BATCH, ] + const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S] + const int* per_head_vertical_topkv, const int* per_head_slash_topkv, + int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S] + int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V] + int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, + int64_t NNZ_V, int64_t NNZ_S, + bool causal // True for intra, False for succ +) { + const int batch_idx = blockIdx.y; + const int head_idx = blockIdx.x; + const int group_idx = blockIdx.z; + + int64_t q_seqlen = q_seqlens[batch_idx]; + int64_t kv_seqlen = kv_seqlens[batch_idx]; + int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x; + int64_t start_m = block_idx_m * BLOCK_SIZE_M; + if (start_m >= q_seqlen) { + return; + } + int64_t end_m = start_m + BLOCK_SIZE_M; + vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V; + slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S; + int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m; + block_count += row_offset; + block_offset += row_offset * NNZ_S; + column_count += row_offset; + column_index += row_offset * NNZ_V; + + // MergeHead: each head has it's unique max topk NNZ_V,NNZ_S. 
(NNZ_V,NNZ_S + // above is buffer size, use to compute offset) + NNZ_S = per_head_slash_topkv[head_idx]; + NNZ_V = per_head_vertical_topkv[head_idx]; + + bool has_slash = true; + int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0; + int64_t s = 0, v = 0; + int64_t v_idx = vertical_indexes[v++]; + int64_t s_idx = slash_indexes[s++]; + if (causal) { + while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) { + s_idx = slash_indexes[s++]; + } + if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false; + s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M); + } else { + while (s_idx >= end_m + kv_seqlen && s < NNZ_S) { + s_idx = slash_indexes[s++]; + } + if (s_idx > end_m + kv_seqlen) has_slash = false; + s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M); + } + + int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx; + if (!has_slash) { + if (causal) { + range_start = (kv_seqlen - q_seqlen) + end_m; + range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N; + } else { + range_start = kv_seqlen; + range_end = kv_seqlen + BLOCK_SIZE_N; + } + } + + bool slash_finished = false; + while (1) { + if (v_idx < range_end) { + if (v_idx < range_start) { + column_index[tmp_col_cnt++] = v_idx; + } + if (v < NNZ_V) { + v_idx = vertical_indexes[v++]; + } else { + if (causal) + v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen); + else + v_idx = end_m + BLOCK_SIZE_N + kv_seqlen; + } + } else { + if ((s < NNZ_S && causal) || + (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) { + if (causal) + s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++], + BLOCK_SIZE_M); + else + s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M); + } else { + if (v == NNZ_V || (v_idx > range_start && causal)) { + // add the last vertical if no more slash + if (v == NNZ_V && !causal && v_idx < kv_seqlen) { + column_index[tmp_col_cnt++] = v_idx; + } + tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, + BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen); + break; + } else { + if (causal) { + range_start = (kv_seqlen - q_seqlen) + end_m; + range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N; + } else { + // if slash_finished but there are vertical left, save current + // blocks + tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, + BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen); + range_start = kv_seqlen; + range_end = kv_seqlen + BLOCK_SIZE_N; + } + slash_finished = true; + } + } + if (!slash_finished) { + if (s_idx > range_end + BLOCK_SIZE_M) { + tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, + BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen); + range_start = s_idx - BLOCK_SIZE_M; + range_end = s_idx; + } else if (s_idx > range_end) { + range_end += BLOCK_SIZE_M; + } + } + } + } + + block_count[0] = tmp_blk_cnt; + column_count[0] = tmp_col_cnt; +} + +void convert_vertical_slash_indexes_64x64_mergehead( + const int* q_seqlens, // [BATCH, ] + const int* kv_seqlens, // [BATCH, ] + const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S] + int* per_head_vertical_topkv, int* per_head_slash_topkv, + int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S] + int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)] + int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V] + int64_t BATCH_SIZE, int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, + int64_t BLOCK_SIZE_N, int64_t NNZ_V, int64_t NNZ_S, bool 
causal) { + const int N_THREADS = 64; + const dim3 dimBlock(N_THREADS); + const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS); + convert_vertical_slash_indexes_kernel_mergehead<<>>( + q_seqlens, kv_seqlens, vertical_indexes, slash_indexes, + per_head_vertical_topkv, per_head_slash_topkv, block_count, block_offset, + column_count, column_index, N_HEADS, N_ROWS, BLOCK_SIZE_M, BLOCK_SIZE_N, + NNZ_V, NNZ_S, causal); +} + +/** + * Implements the Algorithm 4 in paper https://arxiv.org/abs/2407.02490. + * + * Like the above convert_vertical_slash_indexes, but with + * pre-computed vertical and slash counts. + */ +void convert_vertical_slash_indexes_mergehead( + torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] + torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V] + torch::Tensor q_seqlens, // [BATCH, ] + torch::Tensor kv_seqlens, // [BATCH, ] + torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S] + torch::Tensor vertical_indices_count, // [N_HEADS, ] + torch::Tensor slash_indices_count, // [N_HEADS, ] + int64_t context_size, int64_t block_size_M, int64_t block_size_N, + bool causal) { + cudaSetDevice(q_seqlens.get_device()); + + int batch_size = slash_indexes.size(0); + int num_heads = slash_indexes.size(1); + int nnz_slash = slash_indexes.size(2); + int nnz_vertical = vertical_indexes.size(2); + int num_rows = (context_size + block_size_M - 1) / block_size_M; + + convert_vertical_slash_indexes_64x64_mergehead( + q_seqlens.data_ptr(), kv_seqlens.data_ptr(), + vertical_indexes.data_ptr(), slash_indexes.data_ptr(), + vertical_indices_count.data_ptr(), + slash_indices_count.data_ptr(), block_count.data_ptr(), + block_offset.data_ptr(), column_count.data_ptr(), + column_index.data_ptr(), batch_size, num_heads, num_rows, + block_size_M, block_size_N, nnz_vertical, nnz_slash, causal); +} diff --git a/vllm_v0.10.0/csrc/cache.h b/vllm_v0.10.0/csrc/cache.h new file mode 100644 index 0000000..0970b70 --- /dev/null +++ b/vllm_v0.10.0/csrc/cache.h @@ -0,0 +1,48 @@ +#pragma once + +#include + +#include +#include + +void swap_blocks(torch::Tensor& src, torch::Tensor& dst, + const torch::Tensor& block_mapping); + +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. 
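+// copy_blocks copies whole KV-cache blocks, layer by layer, according to a
+// [num_pairs, 2] block_mapping of (src, dst) block numbers.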
+void copy_blocks(std::vector const& key_caches, + std::vector const& value_caches, + const torch::Tensor& block_mapping); + +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping); + +void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, + torch::Tensor& key_cache, torch::Tensor& value_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale); + +void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, + torch::Tensor& key_cache, + torch::Tensor& value_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale); + +void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, + torch::Tensor& kv_cache, torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, + torch::Tensor& scale); + +// Just for unittest +void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, + const double scale, const std::string& kv_cache_dtype); + +void gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, std::optional seq_starts = std::nullopt); \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cache_kernels.cu b/vllm_v0.10.0/csrc/cache_kernels.cu new file mode 100644 index 0000000..88559c8 --- /dev/null +++ b/vllm_v0.10.0/csrc/cache_kernels.cu @@ -0,0 +1,734 @@ +#include +#include +#include + +#include "cuda_utils.h" +#include "cuda_compat.h" +#include "dispatch_utils.h" + +#ifdef USE_ROCM + #include "quantization/fp8/amd/quant_utils.cuh" +#else + #include "quantization/fp8/nvidia/quant_utils.cuh" +#endif + +#include +#include +#include +#include + +#ifdef USE_ROCM + #include +typedef __hip_bfloat16 __nv_bfloat16; +#endif + +void swap_blocks(torch::Tensor& src, torch::Tensor& dst, + const torch::Tensor& block_mapping) { + torch::Device src_device = src.device(); + torch::Device dst_device = dst.device(); + cudaMemcpyKind memcpy_type; + if (src_device.is_cuda() && dst_device.is_cuda()) { + TORCH_CHECK(src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); + memcpy_type = cudaMemcpyDeviceToDevice; + } else if (src_device.is_cuda() && dst_device.is_cpu()) { + memcpy_type = cudaMemcpyDeviceToHost; + } else if (src_device.is_cpu() && dst_device.is_cuda()) { + memcpy_type = cudaMemcpyHostToDevice; + } else { + TORCH_CHECK(false, "Invalid device combination"); + } + + // NOTE(youkaichao): keep in mind that `block_mapping` should be + // a cpu tensor, otherwise every `item` call will require a gpu-cpu + // synchronization. + TORCH_CHECK(block_mapping.device().is_cpu(), "block_mapping must be on CPU"); + + char* src_ptr = static_cast(src.data_ptr()); + char* dst_ptr = static_cast(dst.data_ptr()); + + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + const int64_t block_size_in_bytes = src.element_size() * src.stride(0); + const at::cuda::OptionalCUDAGuard device_guard( + src_device.is_cuda() ? src_device : dst_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + // NOTE(woosuk): This can be slow if the number of blocks is large. 
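+  // Each (src, dst) pair below issues its own cudaMemcpyAsync of one block
+  // (element_size() * stride(0) bytes) on the current stream, which is why a
+  // large block_mapping makes this loop slow.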
+ const int64_t num_blocks = block_mapping.size(0); + for (size_t i = 0; i < num_blocks; i++) { + int64_t src_block_number = block_mapping[i][0].item(); + int64_t dst_block_number = block_mapping[i][1].item(); + int64_t src_offset = src_block_number * block_size_in_bytes; + int64_t dst_offset = dst_block_number * block_size_in_bytes; + cudaMemcpyAsync(dst_ptr + dst_offset, src_ptr + src_offset, + block_size_in_bytes, memcpy_type, stream); + } +} + +namespace vllm { + +// Grid: (num_layers, num_pairs) +template +__global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, + int64_t* value_cache_ptrs, + const int64_t* __restrict__ block_mapping, + const int numel_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + + scalar_t* key_cache = reinterpret_cast(key_cache_ptrs[layer_idx]); + scalar_t* value_cache = + reinterpret_cast(value_cache_ptrs[layer_idx]); + int64_t src_block_number = block_mapping[2 * pair_idx]; + int64_t dst_block_number = block_mapping[2 * pair_idx + 1]; + + const int64_t src_block_offset = src_block_number * numel_per_block; + const int64_t dst_block_offset = dst_block_number * numel_per_block; + for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) { + int64_t src_offset = src_block_offset + i; + int64_t dst_offset = dst_block_offset + i; + key_cache[dst_offset] = key_cache[src_offset]; + } + for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) { + int64_t src_offset = src_block_offset + i; + int64_t dst_offset = dst_block_offset + i; + value_cache[dst_offset] = value_cache[src_offset]; + } +} + +// Kernel for MLA, which works on a single joint kv_cache +// Grid: (num_layers, num_pairs) +template +__global__ void copy_blocks_mla_kernel( + int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping, + const int mem_footprint_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + scalar_t* cache = reinterpret_cast(cache_ptrs[layer_idx]); + int64_t src_block = block_mapping[2 * pair_idx]; + int64_t dst_block = block_mapping[2 * pair_idx + 1]; + int64_t src_offset = src_block * mem_footprint_per_block; + int64_t dst_offset = dst_block * mem_footprint_per_block; + for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) { + cache[dst_offset + i] = cache[src_offset + i]; + } +} + +} // namespace vllm + +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. +void copy_blocks(std::vector const& key_caches, + std::vector const& value_caches, + const torch::Tensor& block_mapping) { + int num_layers = key_caches.size(); + TORCH_CHECK(num_layers == value_caches.size()); + if (num_layers == 0) { + return; + } + torch::Device cache_device = key_caches[0].device(); + TORCH_CHECK(cache_device.is_cuda()); + + // Create data structures for the kernel. + // Create an array of pointers to the key and value caches. + int64_t key_cache_ptrs[num_layers]; + int64_t value_cache_ptrs[num_layers]; + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + key_cache_ptrs[layer_idx] = + reinterpret_cast(key_caches[layer_idx].data_ptr()); + value_cache_ptrs[layer_idx] = + reinterpret_cast(value_caches[layer_idx].data_ptr()); + } + + // block_mapping is a 2D tensor with shape (num_pairs, 2). + int num_pairs = block_mapping.size(0); + + // Move the data structures to the GPU. + // NOTE: This synchronizes the CPU and GPU. 
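+  // from_blob wraps the stack-allocated pointer arrays and .to(cache_device)
+  // copies them onto the GPU, so the kernel can index the per-layer key/value
+  // cache pointers by blockIdx.x while blockIdx.y selects the block pair.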
+ torch::Tensor key_cache_ptrs_tensor = + torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) + .to(cache_device); + torch::Tensor value_cache_ptrs_tensor = + torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) + .to(cache_device); + + // Launch the kernel. + const int numel_per_block = key_caches[0][0].numel(); + dim3 grid(num_layers, num_pairs); + dim3 block(std::min(1024, numel_per_block)); + const at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { + vllm::copy_blocks_kernel<<>>( + key_cache_ptrs_tensor.data_ptr(), + value_cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), numel_per_block); + })); +} + +// copy blocks kernel for MLA (assumes a joint KV-cache) +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping) { + int num_layers = kv_caches.size(); + if (num_layers == 0) { + return; + } + torch::Device cache_device = kv_caches[0].device(); + TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); + + std::vector cache_ptrs(num_layers); + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + cache_ptrs[layer_idx] = + reinterpret_cast(kv_caches[layer_idx].data_ptr()); + } + torch::Tensor cache_ptrs_tensor = + torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) + .to(cache_device); + + int num_pairs = block_mapping.size(0); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + int mem_footprint_per_block = kv_caches[0].stride(0); + dim3 grid(num_layers, num_pairs); + dim3 block(std::min(1024, mem_footprint_per_block)); + const at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { + vllm::copy_blocks_mla_kernel<<>>( + cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), mem_footprint_per_block); + })); +} + +namespace vllm { + +template +__global__ void reshape_and_cache_kernel( + const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] + const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] + cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, + // block_size, x] + cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, + // block_size] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int key_stride, const int value_stride, const int num_heads, + const int head_size, const int block_size, const int x, + const float* k_scale, const float* v_scale) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + if (slot_idx < 0) { + // Padding token that should be ignored. 
+ return; + } + + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + + const int n = num_heads * head_size; + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int64_t src_key_idx = token_idx * key_stride + i; + const int64_t src_value_idx = token_idx * value_stride + i; + + const int head_idx = i / head_size; + const int head_offset = i % head_size; + const int x_idx = head_offset / x; + const int x_offset = head_offset % x; + + const int64_t tgt_key_idx = + block_idx * num_heads * (head_size / x) * block_size * x + + head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + + block_offset * x + x_offset; + const int64_t tgt_value_idx = + block_idx * num_heads * head_size * block_size + + head_idx * head_size * block_size + head_offset * block_size + + block_offset; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + key_cache[tgt_key_idx] = tgt_key; + value_cache[tgt_value_idx] = tgt_value; + } else { + key_cache[tgt_key_idx] = + fp8::scaled_convert(tgt_key, *k_scale); + value_cache[tgt_value_idx] = + fp8::scaled_convert(tgt_value, *v_scale); + } + } +} + +template +__global__ void reshape_and_cache_flash_kernel( + const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] + const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] + cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, + // head_size] + cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, + // head_size] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int64_t block_stride, const int64_t page_stride, + const int64_t head_stride, const int64_t key_stride, + const int64_t value_stride, const int num_heads, const int head_size, + const int block_size, const float* k_scale, const float* v_scale) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + return; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + const int n = num_heads * head_size; + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int64_t src_key_idx = token_idx * key_stride + i; + const int64_t src_value_idx = token_idx * value_stride + i; + const int head_idx = i / head_size; + const int head_offset = i % head_size; + const int64_t tgt_key_value_idx = block_idx * block_stride + + block_offset * page_stride + + head_idx * head_stride + head_offset; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + key_cache[tgt_key_value_idx] = tgt_key; + value_cache[tgt_key_value_idx] = tgt_value; + } else { + key_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_key, *k_scale); + value_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_value, *v_scale); + } + } +} + +template +__global__ void concat_and_cache_mla_kernel( + const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] + const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] + cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // + pe_dim)] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int block_stride, // + const int entry_stride, // + const int kv_c_stride, // + const int k_pe_stride, // + const int kv_lora_rank, // + const int 
pe_dim, // + const int block_size, // + const float* scale // +) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + return; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + + auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst, + int src_stride, int dst_stride, int size, int offset) { + for (int i = threadIdx.x; i < size; i += blockDim.x) { + const int64_t src_idx = token_idx * src_stride + i; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = + fp8::scaled_convert(src[src_idx], *scale); + } + } + }; + + copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); + copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); +} + +} // namespace vllm + +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ + vllm::reshape_and_cache_kernel \ + <<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), key_stride, value_stride, \ + num_heads, head_size, block_size, x, \ + reinterpret_cast(k_scale.data_ptr()), \ + reinterpret_cast(v_scale.data_ptr())); + +void reshape_and_cache( + torch::Tensor& key, // [num_tokens, num_heads, head_size] + torch::Tensor& value, // [num_tokens, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + torch::Tensor& slot_mapping, // [num_tokens] + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { + int num_tokens = slot_mapping.size(0); + int num_heads = key.size(1); + int head_size = key.size(2); + int block_size = key_cache.size(3); + int x = key_cache.size(4); + + int key_stride = key.stride(0); + int value_stride = value.stride(0); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * head_size, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, + CALL_RESHAPE_AND_CACHE) +} + +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. 
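+// At each call site, DISPATCH_BY_KV_CACHE_DTYPE picks the (KV_T, CACHE_T,
+// KV_DTYPE) combination from the source tensor dtype and the kv_cache_dtype
+// string (e.g. "auto" or "fp8") and expands the corresponding CALL_* macro,
+// which launches the kernel instantiated for those types.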
+#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ + vllm::reshape_and_cache_flash_kernel \ + <<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, page_stride, \ + head_stride, key_stride, value_stride, num_heads, head_size, \ + block_size, reinterpret_cast(k_scale.data_ptr()), \ + reinterpret_cast(v_scale.data_ptr())); + +void reshape_and_cache_flash( + torch::Tensor& key, // [num_tokens, num_heads, head_size] + torch::Tensor& value, // [num_tokens, num_heads, head_size] + torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& + value_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { + // NOTE(woosuk): In vLLM V1, key.size(0) can be different from + // slot_mapping.size(0) because of padding for CUDA graphs. + // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because + // both include padding. + // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) + // since key includes padding for CUDA graphs, while slot_mapping does not. + // In this case, slot_mapping.size(0) represents the actual number of tokens + // before padding. + // For compatibility with both cases, we use slot_mapping.size(0) as the + // number of tokens. + int num_tokens = slot_mapping.size(0); + int num_heads = key.size(1); + int head_size = key.size(2); + int block_size = key_cache.size(1); + + int64_t key_stride = key.stride(0); + int64_t value_stride = value.stride(0); + int64_t block_stride = key_cache.stride(0); + int64_t page_stride = key_cache.stride(1); + int64_t head_stride = key_cache.stride(2); + TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * head_size, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, + CALL_RESHAPE_AND_CACHE_FLASH); +} + +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. +#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::concat_and_cache_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ + reinterpret_cast(scale.data_ptr())); + +void concat_and_cache_mla( + torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] + torch::Tensor& k_pe, // [num_tokens, pe_dim] + torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + + // pe_dim)] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] + const std::string& kv_cache_dtype, torch::Tensor& scale) { + // NOTE(woosuk): In vLLM V1, key.size(0) can be different from + // slot_mapping.size(0) because of padding for CUDA graphs. + // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because + // both include padding. 
+ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) + // since key includes padding for CUDA graphs, while slot_mapping does not. + // In this case, slot_mapping.size(0) represents the actual number of tokens + // before padding. + // For compatibility with both cases, we use slot_mapping.size(0) as the + // number of tokens. + int num_tokens = slot_mapping.size(0); + int kv_lora_rank = kv_c.size(1); + int pe_dim = k_pe.size(1); + int block_size = kv_cache.size(1); + + TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + + int kv_c_stride = kv_c.stride(0); + int k_pe_stride = k_pe.stride(0); + int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); + + dim3 grid(num_tokens); + dim3 block(std::min(kv_lora_rank, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, + CALL_CONCAT_AND_CACHE_MLA); +} + +namespace vllm { + +template +__global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, + Tout* __restrict__ dst_cache, + const float scale, + const int64_t block_stride) { + const int64_t block_idx = blockIdx.x; + for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { + int64_t idx = block_idx * block_stride + i; + dst_cache[idx] = + fp8::scaled_convert(src_cache[idx], scale); + } +} + +} // namespace vllm + +#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \ + vllm::convert_fp8_kernel<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst_cache.data_ptr()), scale, block_stride); + +// Only for testing. +void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, + const double scale, const std::string& kv_cache_dtype) { + torch::Device src_device = src_cache.device(); + torch::Device dst_device = dst_cache.device(); + TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") + TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU") + TORCH_CHECK(src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); + at::cuda::OptionalCUDAGuard device_guard(src_device); + + int64_t num_blocks = src_cache.size(0); + int64_t block_stride = src_cache.stride(0); + + dim3 grid(num_blocks); + dim3 block(std::min(block_stride, int64_t(512))); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (kv_cache_dtype == "auto") { + if (src_cache.dtype() == at::ScalarType::Float) { + CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kAuto); + } else if (src_cache.dtype() == at::ScalarType::Half) { + CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto); + } else if (src_cache.dtype() == at::ScalarType::BFloat16) { + CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto); + } else if (dst_cache.dtype() == at::ScalarType::Float) { + CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kAuto); + } else if (dst_cache.dtype() == at::ScalarType::Half) { + CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kAuto); + } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { + CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kAuto); + } + } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") { + if (src_cache.dtype() == at::ScalarType::Float) { + CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kFp8E4M3); + } else if (src_cache.dtype() == at::ScalarType::Half) { + CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kFp8E4M3); + } 
else if (src_cache.dtype() == at::ScalarType::BFloat16) { + CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, + vllm::Fp8KVCacheDataType::kFp8E4M3); + } else if (dst_cache.dtype() == at::ScalarType::Float) { + CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); + } else if (dst_cache.dtype() == at::ScalarType::Half) { + CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); + } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { + CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3); + } + } else { + TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype); + } +} + +namespace vllm { + +// grid is launched with dimensions (batch, num_splits) +template +__global__ void gather_cache( + const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRIES...] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t block_size, const int32_t entry_size, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride, + const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per + // batch + + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = cu_seq_lens[bid]; + const int32_t seq_end = cu_seq_lens[bid + 1]; + const int32_t seq_len = seq_end - seq_start; + const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size); + const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits); + + const int32_t split_start = split * split_blocks; + const int32_t split_end = min((split + 1) * split_blocks, tot_blocks); + + const bool is_active_split = (split_start < tot_blocks); + const bool is_last_split = (split_end == tot_blocks); + + if (!is_active_split) return; + + int32_t full_blocks_end = split_end; + int32_t partial_block_size = 0; + + // Adjust the pointer for the block_table for this batch. + // If seq_starts is provided, compute an offset based on (seq_starts[bid] / + // page_size) + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = 0; + if (seq_starts != nullptr) { + offset = seq_starts[bid] / block_size; + } + const int32_t* batch_block_table = block_table + batch_offset + offset; + + // Adjust dst pointer based on the cumulative sequence lengths. 
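  // [Editorial note - illustrative only, not part of the original patch]
  // Worked example of the split bookkeeping above, assuming block_size = 16,
  // seq_len = 35 and num_splits = 2:
  //   tot_blocks   = ceil(35 / 16) = 3
  //   split_blocks = ceil(3 / 2)   = 2
  //   split 0 covers blocks [0, 2) and copies 2 * 16 = 32 full entries;
  //   split 1 covers block  [2, 3), is the last split, and therefore handles
  //   the partial block of 35 % 16 = 3 remaining entries.
  // (The dst pointer adjustment below then skips the tokens that belong to
  // earlier batches.)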
+ dst += seq_start * dst_entry_stride; + + if (is_last_split) { + partial_block_size = seq_len % block_size; + if (partial_block_size) full_blocks_end -= 1; + } + + auto copy_entry = [&](const scalar_t* __restrict__ _src, + scalar_t* __restrict__ _dst) { + for (int i = threadIdx.x; i < entry_size; i += blockDim.x) + _dst[i] = _src[i]; + }; + + for (int pid = split_start; pid < full_blocks_end; ++pid) { + auto block_id = batch_block_table[pid]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; + for (int eid = 0; eid < block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } + } + + if (partial_block_size) { + auto block_id = batch_block_table[full_blocks_end]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; + for (int eid = 0; eid < partial_block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } + } +} + +} // namespace vllm + +// Macro to dispatch the kernel based on the data type. +#define CALL_GATHER_CACHE(CPY_DTYPE) \ + vllm::gather_cache<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + block_size, entry_size, block_table_stride, cache_block_stride, \ + cache_entry_stride, dst_entry_stride, seq_starts_ptr); + +// Gather sequences from the cache into the destination tensor. +// - cu_seq_lens contains the cumulative sequence lengths for each batch +// - block_table contains the cache block indices for each sequence +// - Optionally, seq_starts (if provided) offsets the starting block index by +// (seq_starts[bid] / page_size) +void gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, + std::optional seq_starts = std::nullopt) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t entry_size = src_cache.flatten(2, -1).size(2); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32, + "cu_seq_lens must be int32"); + if (seq_starts.has_value()) { + TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, + "seq_starts must be int32"); + } + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == cu_seq_lens.device(), + "src_cache and cu_seq_lens must be on the same device"); + if (seq_starts.has_value()) { + TORCH_CHECK(src_cache.device() == seq_starts.value().device(), + "src_cache and seq_starts must be on the same device"); + } + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size. + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 
4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(1024); + + TORCH_CHECK(src_cache.dtype() == dst.dtype(), + "src_cache and dst must have the same dtype"); + + const int dtype_bits = src_cache.element_size() * 8; + const int32_t* seq_starts_ptr = + seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr; + + if (dtype_bits == 32) { + CALL_GATHER_CACHE(uint32_t); + } else if (dtype_bits == 16) { + CALL_GATHER_CACHE(uint16_t); + } else if (dtype_bits == 8) { + CALL_GATHER_CACHE(uint8_t); + } else { + TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); + } +} diff --git a/vllm_v0.10.0/csrc/core/exception.hpp b/vllm_v0.10.0/csrc/core/exception.hpp new file mode 100644 index 0000000..f3b2ffa --- /dev/null +++ b/vllm_v0.10.0/csrc/core/exception.hpp @@ -0,0 +1,3 @@ +#pragma once + +#define VLLM_IMPLIES(p, q) (!(p) || (q)) diff --git a/vllm_v0.10.0/csrc/core/math.hpp b/vllm_v0.10.0/csrc/core/math.hpp new file mode 100644 index 0000000..6764e1f --- /dev/null +++ b/vllm_v0.10.0/csrc/core/math.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +inline constexpr uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +static inline constexpr auto div_ceil(A a, B b) { + return (a + b - 1) / b; +} + +// Round a down to the next multiple of b. The caller is responsible for making +// sure that b is non-zero +template +inline constexpr T round_to_previous_multiple_of(T a, T b) { + return a % b == 0 ? a : (a / b) * b; +} + +// Round a up to the next multiple of b. The caller is responsible for making +// sure that b is non-zero +template +inline constexpr T round_to_next_multiple_of(T a, T b) { + return a % b == 0 ? a : ((a / b) + 1) * b; +} diff --git a/vllm_v0.10.0/csrc/core/registration.h b/vllm_v0.10.0/csrc/core/registration.h new file mode 100644 index 0000000..4d0ce1c --- /dev/null +++ b/vllm_v0.10.0/csrc/core/registration.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#define _CONCAT(A, B) A##B +#define CONCAT(A, B) _CONCAT(A, B) + +#define _STRINGIFY(A) #A +#define STRINGIFY(A) _STRINGIFY(A) + +// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) + +// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ + TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) + +// REGISTER_EXTENSION allows the shared library to be loaded and initialized +// via python's import statement. +#define REGISTER_EXTENSION(NAME) \ + PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ + static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ + STRINGIFY(NAME), nullptr, 0, nullptr}; \ + return PyModule_Create(&module); \ + } diff --git a/vllm_v0.10.0/csrc/core/scalar_type.hpp b/vllm_v0.10.0/csrc/core/scalar_type.hpp new file mode 100644 index 0000000..d0f85e2 --- /dev/null +++ b/vllm_v0.10.0/csrc/core/scalar_type.hpp @@ -0,0 +1,350 @@ +#pragma once + +// For TORCH_CHECK +#include + +namespace vllm { + +// +// ScalarType can represent a wide range of floating point and integer types, +// in particular it can be used to represent sub-byte data types (something +// that torch.dtype currently does not support). 
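//
// [Editorial example - not part of the original header] For instance, a 4-bit
// unsigned integer with a bias of 8, as used by some quantized-weight
// formats, can be described as:
//
//   constexpr vllm::ScalarType u4b8 = vllm::ScalarType::uint(4, 8);
//   static_assert(u4b8.size_bits() == 4);  // sub-byte width
//   // a stored value s represents the logical value s - 8
//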
+// +// The type definitions on the Python side can be found in: vllm/scalar_type.py +// these type definitions should be kept up to date with any Python API changes +// here. +// +class ScalarType { + public: + enum NanRepr : uint8_t { + NAN_NONE = 0, // nans are not supported + NAN_IEEE_754 = 1, // nans are: exp all 1s, mantissa not all 0s + NAN_EXTD_RANGE_MAX_MIN = 2, // nans are: exp all 1s, mantissa all 1s + + NAN_REPR_ID_MAX + }; + + constexpr ScalarType(uint8_t exponent, uint8_t mantissa, bool signed_, + int32_t bias, bool finite_values_only = false, + NanRepr nan_repr = NAN_IEEE_754) + : exponent(exponent), + mantissa(mantissa), + signed_(signed_), + bias(bias), + finite_values_only(finite_values_only), + nan_repr(nan_repr) {}; + + static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { + return ScalarType(0, size_bits - 1, true, bias); + } + + static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) { + return ScalarType(0, size_bits, false, bias); + } + + // IEEE 754 compliant floating point type + static constexpr ScalarType float_IEEE754(uint8_t exponent, + uint8_t mantissa) { + TORCH_CHECK(mantissa > 0 && exponent > 0); + return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754); + } + + // IEEE 754 non-compliant floating point type + static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa, + bool finite_values_only, + NanRepr nan_repr) { + TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr"); + TORCH_CHECK(mantissa > 0 && exponent > 0); + TORCH_CHECK(nan_repr != NAN_IEEE_754, + "use `float_IEEE754` constructor for floating point types that " + "follow IEEE 754 conventions"); + return ScalarType(exponent, mantissa, true, 0, finite_values_only, + nan_repr); + } + + uint8_t const exponent; // size of the exponent field (0 for integer types) + uint8_t const mantissa; // size of the mantissa field (size of the integer + // excluding the sign bit for integer types) + bool const signed_; // flag if the type supports negative numbers (i.e. has a + // sign bit) + int32_t const bias; // stored values equal value + bias, + // used for quantized type + + // Extra Floating point info + bool const finite_values_only; // i.e. no +/-inf if true + NanRepr const nan_repr; // how NaNs are represented + // (not applicable for integer types) + + using Id = int64_t; + + private: + // Field size in id + template + static constexpr size_t member_id_field_width() { + using T = std::decay_t; + return std::is_same_v ? 1 : sizeof(T) * 8; + } + + template + static constexpr auto reduce_members_helper(Fn f, Init val, Member member, + Rest... 
rest) { + auto new_val = f(val, member); + if constexpr (sizeof...(rest) > 0) { + return reduce_members_helper(f, new_val, rest...); + } else { + return new_val; + }; + } + + template + constexpr auto reduce_members(Fn f, Init init) const { + // Should be in constructor order for `from_id` + return reduce_members_helper(f, init, exponent, mantissa, signed_, bias, + finite_values_only, nan_repr); + }; + + template + static constexpr auto reduce_member_types(Fn f, Init init) { + constexpr auto dummy_type = ScalarType(0, 0, false, 0, false, NAN_NONE); + return dummy_type.reduce_members(f, init); + }; + + static constexpr auto id_size_bits() { + return reduce_member_types( + [](int acc, auto member) -> int { + return acc + member_id_field_width(); + }, + 0); + } + + public: + // unique id for this scalar type that can be computed at compile time for + // c++17 template specialization this is not needed once we migrate to + // c++20 and can pass literal classes as template parameters + constexpr Id id() const { + static_assert(id_size_bits() <= sizeof(Id) * 8, + "ScalarType id is too large to be stored"); + + auto or_and_advance = [](std::pair result, + auto member) -> std::pair { + auto [id, bit_offset] = result; + auto constexpr bits = member_id_field_width(); + return {id | (int64_t(member) & ((uint64_t(1) << bits) - 1)) + << bit_offset, + bit_offset + bits}; + }; + return reduce_members(or_and_advance, std::pair{}).first; + } + + // create a ScalarType from an id, for c++17 template specialization, + // this is not needed once we migrate to c++20 and can pass literal + // classes as template parameters + static constexpr ScalarType from_id(Id id) { + auto extract_and_advance = [id](auto result, auto member) { + using T = decltype(member); + auto [tuple, bit_offset] = result; + auto constexpr bits = member_id_field_width(); + auto extracted_val = static_cast((int64_t(id) >> bit_offset) & + ((uint64_t(1) << bits) - 1)); + auto new_tuple = std::tuple_cat(tuple, std::make_tuple(extracted_val)); + return std::pair{new_tuple, bit_offset + bits}; + }; + + auto [tuple_args, _] = reduce_member_types(extract_and_advance, + std::pair, int>{}); + return std::apply([](auto... 
args) { return ScalarType(args...); }, + tuple_args); + } + + constexpr int64_t size_bits() const { + return mantissa + exponent + is_signed(); + } + constexpr bool is_signed() const { return signed_; } + constexpr bool is_integer() const { return exponent == 0; } + constexpr bool is_floating_point() const { return exponent > 0; } + constexpr bool is_ieee_754() const { + return is_floating_point() && finite_values_only == false && + nan_repr == NAN_IEEE_754; + } + constexpr bool has_nans() const { + return is_floating_point() && nan_repr != NAN_NONE; + } + constexpr bool has_infs() const { + return is_floating_point() && finite_values_only == false; + } + constexpr bool has_bias() const { return bias != 0; } + + private: + double _floating_point_max() const { + TORCH_CHECK(mantissa <= 52 && exponent <= 11, + "Cannot represent max/min as a double for type ", str()); + + uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1; + if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) { + max_mantissa -= 1; + } + + uint64_t max_exponent = (uint64_t(1) << exponent) - 2; + if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) { + TORCH_CHECK(exponent < 11, + "Cannot represent max/min as a double for type ", str()); + max_exponent += 1; + } + + // adjust the exponent to match that of a double + // for now we assume the exponent bias is the standard 2^(e-1) -1, (where e + // is the exponent bits), there is some precedent for non-standard biases, + // example `float8_e4m3b11fnuz` here: https://github.com/jax-ml/ml_dtypes + // but to avoid premature over complication we are just assuming the + // standard exponent bias until there is a need to support non-standard + // biases + uint64_t exponent_bias = (uint64_t(1) << (exponent - 1)) - 1; + uint64_t exponent_bias_double = (uint64_t(1) << 10) - 1; // double e = 11 + + uint64_t max_exponent_double = + max_exponent - exponent_bias + exponent_bias_double; + + // shift the mantissa into the position for a double and + // the exponent + uint64_t double_raw = + (max_mantissa << (52 - mantissa)) | (max_exponent_double << 52); + + return *reinterpret_cast(&double_raw); + } + + constexpr std::variant _raw_max() const { + if (is_floating_point()) { + return {_floating_point_max()}; + } else { + TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(), + "Cannot represent max as a int64_t"); + return {(int64_t(1) << mantissa) - 1}; + } + } + + constexpr std::variant _raw_min() const { + if (is_floating_point()) { + TORCH_CHECK(is_signed(), + "We currently assume all floating point types are signed"); + constexpr uint64_t sign_bit_double = (uint64_t(1) << 63); + + double max = _floating_point_max(); + uint64_t max_raw = *reinterpret_cast(&max); + uint64_t min_raw = max_raw | sign_bit_double; + return {*reinterpret_cast(&min_raw)}; + } else { + TORCH_CHECK(!is_signed() || size_bits() <= 64, + "Cannot represent min as a int64_t"); + if (is_signed()) { + // set the top bit to 1 (i.e. INT64_MIN) and the rest to 0 + // then perform an arithmetic shift right to set all the bits above + // (size_bits() - 1) to 1 + return {INT64_MIN >> (64 - size_bits())}; + } else { + return {int64_t(0)}; + } + } + } + + public: + // Max representable value for this scalar type. + // (accounting for bias if there is one) + constexpr std::variant max() const { + return std::visit( + [this](auto x) -> std::variant { return {x - bias}; }, + _raw_max()); + } + + // Min representable value for this scalar type. 
+ // (accounting for bias if there is one) + constexpr std::variant min() const { + return std::visit( + [this](auto x) -> std::variant { return {x - bias}; }, + _raw_min()); + } + + std::string str() const { + /* naming generally follows: https://github.com/jax-ml/ml_dtypes + * for floating point types (leading f) the scheme is: + * `float_em[flags]` + * flags: + * - no-flags: means it follows IEEE 754 conventions + * - f: means finite values only (no infinities) + * - n: means nans are supported (non-standard encoding) + * for integer types the scheme is: + * `[u]int[b]` + * - if bias is not present it means its zero + */ + if (is_floating_point()) { + auto ret = "float" + std::to_string(size_bits()) + "_e" + + std::to_string(exponent) + "m" + std::to_string(mantissa); + if (!is_ieee_754()) { + if (finite_values_only) { + ret += "f"; + } + if (nan_repr != NAN_NONE) { + ret += "n"; + } + } + return ret; + } else { + auto ret = ((is_signed()) ? "int" : "uint") + std::to_string(size_bits()); + if (has_bias()) { + ret += "b" + std::to_string(bias); + } + return ret; + } + } + + constexpr bool operator==(ScalarType const& other) const { + return mantissa == other.mantissa && exponent == other.exponent && + bias == other.bias && signed_ == other.signed_ && + finite_values_only == other.finite_values_only && + nan_repr == other.nan_repr; + } +}; + +using ScalarTypeId = ScalarType::Id; + +// "rust style" names generally following: +// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70 +static inline constexpr auto kS4 = ScalarType::int_(4); +static inline constexpr auto kU4 = ScalarType::uint(4); +static inline constexpr auto kU4B8 = ScalarType::uint(4, 8); +static inline constexpr auto kS8 = ScalarType::int_(8); +static inline constexpr auto kU8 = ScalarType::uint(8); +static inline constexpr auto kU8B128 = ScalarType::uint(8, 128); + +static inline constexpr auto kFE2M1f = + ScalarType::float_(2, 1, true, ScalarType::NAN_NONE); +static inline constexpr auto kFE3M2f = + ScalarType::float_(3, 2, true, ScalarType::NAN_NONE); +static inline constexpr auto kFE4M3fn = + ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); +static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2); +static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7); +static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10); + +// Fixed width style names, generally following: +// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L47-L57 +static inline constexpr auto kInt4 = kS4; +static inline constexpr auto kUint4 = kU4; +static inline constexpr auto kUint4b8 = kU4B8; +static inline constexpr auto kInt8 = kS8; +static inline constexpr auto kUint8 = kU8; +static inline constexpr auto kUint8b128 = kU8B128; + +static inline constexpr auto kFloat4_e2m1f = kFE2M1f; +static inline constexpr auto kFloat6_e3m2f = kFE3M2f; +static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn; +static inline constexpr auto kFloat8_e5m2 = kFE5M2; +static inline constexpr auto kFloat16_e8m7 = kFE8M7; +static inline constexpr auto kFloat16_e5m10 = kFE5M10; + +// colloquial names +static inline constexpr auto kHalf = kFE5M10; +static inline constexpr auto kFloat16 = kHalf; +static inline constexpr auto kBFloat16 = kFE8M7; + +static inline constexpr auto kFloat16Id = kFloat16.id(); +}; // namespace vllm diff --git a/vllm_v0.10.0/csrc/cpu/activation.cpp 
b/vllm_v0.10.0/csrc/cpu/activation.cpp new file mode 100644 index 0000000..039b8d5 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/activation.cpp @@ -0,0 +1,163 @@ +#include "cpu_types.hpp" + +namespace { +template +void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input, + scalar_t* __restrict__ output) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + + TORCH_CHECK(d % VEC_ELEM_NUM == 0); + +#pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + for (int j = 0; j < d; j += VEC_ELEM_NUM) { + int start = i * d; + if constexpr (is_gated) { + start *= 2; + } + + const scalar_vec_t x(input + start + j); + const vec_op::FP32Vec8 f32_x(x); + vec_op::FP32Vec8 f32_ans = func(f32_x); + + if constexpr (is_gated) { + const scalar_vec_t y(input + start + d + j); + const vec_op::FP32Vec8 f32_y(y); + f32_ans = f32_y * f32_ans; + } + + const scalar_vec_t result(f32_ans); + result.save(output + i * d + j); + } + } +} + +FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + return x / (ones + (zeros - x).exp()); +} + +FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(0.79788456f); + const vec_op::FP32Vec8 w2(0.044715f); + const vec_op::FP32Vec8 w3(0.5); + const vec_op::FP32Vec8 x3 = x * x * x; + const vec_op::FP32Vec8 t = (w1 * (x + w2 * x3)).tanh(); + return w3 * x * (ones + t); +} + +FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(0.79788456f); + const vec_op::FP32Vec8 w2(0.044715f); + const vec_op::FP32Vec8 w3(0.5); + const vec_op::FP32Vec8 t = (x * w1 * (ones + x * w2 * x)).tanh(); + return w3 * x * (ones + t); +} + +FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(1.702f); + return x / (ones + (zeros - w1 * x).exp()); +} + +FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(M_SQRT1_2); + const vec_op::FP32Vec8 w2(0.5); + return x * w2 * (ones + (x * w1).er()); +} + +FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5); + const vec_op::FP32Vec8 w2(0.5); + const vec_op::FP32Vec8 w3(0.044715); + const vec_op::FP32Vec8 x_3 = x * x * x; + const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3); + return x * w2 * (ones + inner.tanh()); +} +}; // namespace + +void silu_and_mul(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1) / 2; + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] { + CPU_KERNEL_GUARD_IN(silu_and_mul_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(silu_and_mul_impl) + }); +} + +void gelu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1) / 2; + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_and_mul_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl) + }); +} + +void 
gelu_tanh_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1) / 2; + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "gelu_tanh_and_mul_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_tanh_and_mul_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), + out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_tanh_and_mul_impl) + }); +} + +void gelu_new(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_new_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_new_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_new_impl) + }); +} + +void gelu_fast(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_fast_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_fast_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_fast_impl) + }); +} + +void gelu_quick(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_quick_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_quick_impl) + }); +} diff --git a/vllm_v0.10.0/csrc/cpu/attention.cpp b/vllm_v0.10.0/csrc/cpu/attention.cpp new file mode 100644 index 0000000..82862fe --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/attention.cpp @@ -0,0 +1,798 @@ +#include "cpu_types.hpp" + +namespace { + +template +struct KernelVecType { + using q_load_vec_type = void; + using q_vec_type = void; + using k_load_vec_type = void; + using k_vec_type = void; + using qk_acc_vec_type = void; + using v_load_vec_type = void; +}; + +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::FP32Vec4; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::FP32Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power and s390x architecture-specific vector types + using q_load_vec_type = vec_op::FP32Vec8; + using k_load_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures, including x86 + using q_load_vec_type = vec_op::FP16Vec8; + using k_load_vec_type = vec_op::FP16Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +#endif + using q_vec_type = vec_op::FP32Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; +}; + +#ifdef __AVX512BF16__ +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::BF16Vec32; + using k_load_vec_type = vec_op::BF16Vec32; + using k_vec_type = vec_op::BF16Vec32; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; +#else + #ifdef __aarch64__ + #ifndef ARM_BF16_SUPPORT + // pass + #else +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + 
using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; + #endif + #else +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; + #endif +#endif + +template +FORCE_INLINE std::pair reduceSoftmax(T* data, const int size, + const int capacity) { + T max = data[0]; + for (int i = 1; i < size; ++i) { + max = max >= data[i] ? max : data[i]; + } + + T sum = 0; + for (int i = 0; i < size; ++i) { + data[i] = std::exp(data[i] - max); + sum += data[i]; + } + + int i = 0; + for (; i < size; ++i) { + data[i] /= sum; + } + + for (; i < capacity; ++i) { + data[i] = 0; + } + + return {max, sum}; +} + +template +FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, + const int capacity, + const float alibi_slope, + const int start_index, + const int seq_len) { + data[0] += alibi_slope * (start_index - seq_len + 1); + T max = data[0]; + for (int i = 1; i < size; ++i) { + T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1); + data[i] = qk; + max = max >= qk ? max : qk; + } + + T sum = 0; + for (int i = 0; i < size; ++i) { + data[i] = std::exp(data[i] - max); + sum += data[i]; + } + + int i = 0; + for (; i < size; ++i) { + data[i] /= sum; + } + + for (; i < capacity; ++i) { + data[i] = 0; + } + + return {max, sum}; +} + +template +FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, + const int size) { + T max = max_data[0]; + for (int i = 1; i < size; ++i) { + max = max >= max_data[i] ? max : max_data[i]; + } + + T rescaled_sum = 0; + for (int i = 0; i < size; ++i) { + T rescale_factor = std::exp(max_data[i] - max); + rescaled_sum += rescale_factor * sum_data[i]; + sum_data[i] *= rescale_factor; + } + for (int i = 0; i < size; ++i) { + sum_data[i] /= rescaled_sum + 1e-8; + } +} + +template +struct reduceQKBlockKernel { + using q_load_vec_type = typename KernelVecType::q_load_vec_type; + using q_vec_type = typename KernelVecType::q_vec_type; + using k_load_vec_type = typename KernelVecType::k_load_vec_type; + using k_vec_type = typename KernelVecType::k_vec_type; + using qk_acc_vec_type = typename KernelVecType::qk_acc_vec_type; + + constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x; + constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP; + constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4; + + static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4); + static_assert(k_load_vec_type::get_elem_num() % x == 0); + static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16); + + FORCE_INLINE static void call(const scalar_t* __restrict__ q, + const scalar_t* __restrict__ k_block, + float* __restrict__ logits, float scale, + const int token_num) { + const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; + + qk_acc_vec_type group_accums[MAX_GROUP_NUM]; + if (token_num == BLOCK_SIZE) { + for (int q_offset = 0; q_offset < HEAD_SIZE; + q_offset += x, k_block += x * BLOCK_SIZE) { + q_load_vec_type q_load_group_vec(q + q_offset); + q_vec_type q_group_vec(q_load_group_vec); + + vec_op::unroll_loop( + [k_block, &q_group_vec, &group_accums](int token_group_idx) { + k_load_vec_type k_load_group_vec(k_block + token_group_idx * x * + TOKEN_PER_GROUP); + k_vec_type k_group_vec(k_load_group_vec); + vec_op::fma(group_accums[token_group_idx], 
q_group_vec, + k_group_vec); + vec_op::prefetch(k_block + x * BLOCK_SIZE + + token_group_idx * x * TOKEN_PER_GROUP); + }); + } + } else { + for (int q_offset = 0; q_offset < HEAD_SIZE; + q_offset += x, k_block += x * BLOCK_SIZE) { + q_load_vec_type q_load_group_vec(q + q_offset); + q_vec_type q_group_vec(q_load_group_vec); + for (int token_group_start = 0; token_group_start < group_num; + token_group_start += UNROLL_GROUP_NUM) { + vec_op::unroll_loop( + [token_group_start, k_block, &q_group_vec, + &group_accums](int token_group_idx) { + token_group_idx += token_group_start; + k_load_vec_type k_load_group_vec(k_block + token_group_idx * x * + TOKEN_PER_GROUP); + k_vec_type k_group_vec(k_load_group_vec); + vec_op::fma(group_accums[token_group_idx], q_group_vec, + k_group_vec); + vec_op::prefetch(k_block + x * BLOCK_SIZE + + token_group_idx * x * TOKEN_PER_GROUP); + }); + } + } + } + + for (int token_group_idx = 0; token_group_idx < group_num; + ++token_group_idx) { + vec_op::unroll_loop( + [&group_accums, logits, scale, token_group_idx](int token_idx) { + float dot_v = + group_accums[token_group_idx] + .template reduce_sub_sum(token_idx); + logits[token_group_idx * TOKEN_PER_GROUP + token_idx] = + dot_v * scale; + }); + } + } +}; + +template +FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block, + acc_t&& acc) { + using v_load_vec_type = typename KernelVecType::v_load_vec_type; + constexpr int ELEM_NUM = v_load_vec_type::get_elem_num(); + static_assert(BLOCK_SIZE == ELEM_NUM); + vec_op::FP32Vec16 prob_vec(prob); + + vec_op::unroll_loop([&](int head_elem_idx) { + v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx); + vec_op::FP32Vec16 fp32_v_vec(v_vec); + acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec; + }); +} +}; // namespace + +// Paged attention v1 +namespace { +template +struct paged_attention_v1_impl { + static void call( + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, + // max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const int num_seqs, const int num_heads) { + constexpr int x = 16 / sizeof(scalar_t); + const int num_queries_per_kv = num_heads / num_kv_heads; + + static_assert(BLOCK_SIZE == 16); + + int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE; + int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0; + TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0); + + const int parallel_work_item_num = omp_get_max_threads(); + + size_t logits_bytes = + parallel_work_item_num * max_seq_len_padded * sizeof(float); + float* logits = (float*)std::aligned_alloc( + 64, logits_bytes); // Cacheline alignment for each context token. 
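  // [Editorial note - illustrative only, not part of the original patch]
  // The padding above rounds max_seq_len up to a multiple of 16 floats
  // (64 bytes, one cache line); e.g. max_seq_len = 1000 gives
  // (1000 + 15) & 0xFFFFFFF0 = 1008. Each OpenMP thread then works on its own
  // max_seq_len_padded-sized row of this scratch buffer, so rows stay
  // cache-line aligned and threads do not write to shared cache lines.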
+ // [parallel_work_item_num, max_seq_len_padded] + +#pragma omp parallel for collapse(2) schedule(dynamic, 1) + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + int seq_len = seq_lens[seq_idx]; + const int* seq_block_table = + block_tables + max_num_blocks_per_seq * seq_idx; + const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int64_t kv_head_idx = head_idx / num_queries_per_kv; + const scalar_t* __restrict__ q_vec_ptr = + q + seq_idx * q_stride + head_idx * HEAD_SIZE; + const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; + float* __restrict__ thread_block_logits = + logits + omp_get_thread_num() * max_seq_len_padded; + + // Compute logits + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; + const scalar_t* __restrict__ k_block_cache_ptr = + k_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride; + float* __restrict__ head_block_logits = + thread_block_logits + block_idx * BLOCK_SIZE; + + reduceQKBlockKernel::call( + q_vec_ptr, k_block_cache_ptr, head_block_logits, scale, + block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE); + } + + // Compute softmax + if (alibi_slopes) { + reduceSoftmaxAlibi(thread_block_logits, seq_len, + block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0, + seq_len); + } else { + reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE); + } + + // Compute value + constexpr int head_elem_num_per_partition = 16; + constexpr int head_partition_num = + HEAD_SIZE / head_elem_num_per_partition; + for (int head_part_idx = 0; head_part_idx < head_partition_num; + ++head_part_idx) { + vec_op::FP32Vec16 accums[head_elem_num_per_partition]; + scalar_t* __restrict__ out_ptr = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + + head_part_idx * head_elem_num_per_partition; + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; + const float* __restrict__ prob_vec_ptr = + thread_block_logits + block_idx * BLOCK_SIZE; + const scalar_t* __restrict__ v_block_cache_ptr = + v_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; + reduceValueBlock( + prob_vec_ptr, v_block_cache_ptr, accums); + + if (block_idx != block_num - 1) { + const int64_t next_physical_block_idx = + seq_block_table[block_idx + 1]; + const scalar_t* __restrict__ next_v_block_cache_ptr = + v_cache + next_physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; + vec_op::unroll_loop( + [&](int head_elem_idx) { + if (head_elem_idx % 2 == 0) { + vec_op::prefetch(next_v_block_cache_ptr + + BLOCK_SIZE * head_elem_idx); + } + }); + } + } + + vec_op::unroll_loop( + [&](int head_elem_idx) { + float value = accums[head_elem_idx].reduce_sum(); + vec_op::storeFP32(value, out_ptr + head_elem_idx); + }); + } + } + } + std::free(logits); + } +}; + +#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ + paged_attention_v1_impl::call( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \ + num_heads); + +template +void paged_attention_v1_impl_launcher( + torch::Tensor& out, torch::Tensor& 
query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const std::optional& alibi_slopes) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + switch (head_size) { + case 32: + LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; + case 64: + LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); + break; + case 80: + LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); + break; + case 96: + LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); + break; + case 112: + LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); + break; + case 128: + LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); + break; + case 192: + LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); + break; + case 256: + LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ + paged_attention_v1_impl_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes); + +#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ + switch (block_size) { \ + case 16: \ + CALL_V1_KERNEL_LAUNCHER(T, 16); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } +} // namespace + +void paged_attention_v1( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + TORCH_CHECK(blocksparse_vert_stride <= 1, + "CPU backend does not support blocksparse attention yet."); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", + [&] { + CPU_KERNEL_GUARD_IN(paged_attention_v1_impl) + CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); + CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl) + }); +} + +// Paged attention v2 +namespace { +template +struct paged_attention_v2_impl { + static void call( + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // 
[num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, + // max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const int num_seqs, const int num_heads, const int max_num_partitions) { + constexpr int x = 16 / sizeof(scalar_t); + const int num_queries_per_kv = num_heads / num_kv_heads; + + static_assert(BLOCK_SIZE == 16); + static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0); + static_assert(PARTITION_SIZE % BLOCK_SIZE == 0); + +#pragma omp parallel for collapse(3) schedule(static, 1) + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + for (int partition_idx = 0; partition_idx < max_num_partitions; + ++partition_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + const int seq_len = seq_lens[seq_idx]; + const int start_token_idx = partition_idx * PARTITION_SIZE; + + if (start_token_idx >= seq_len) continue; + + const int partition_num = + (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; + const bool no_reduce = (partition_num == 1); + const int token_num = + (std::min(seq_len, start_token_idx + PARTITION_SIZE) - + start_token_idx); + const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int last_block_token_num = + token_num - (block_num - 1) * BLOCK_SIZE; + const int* seq_block_table = block_tables + + max_num_blocks_per_seq * seq_idx + + start_token_idx / BLOCK_SIZE; + const int64_t kv_head_idx = head_idx / num_queries_per_kv; + const scalar_t* __restrict__ q_vec_ptr = + q + seq_idx * q_stride + head_idx * HEAD_SIZE; + + float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0}; + + // Compute logits + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; + const scalar_t* __restrict__ k_block_cache_ptr = + k_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride; + float* __restrict__ head_block_logits = + logits + block_idx * BLOCK_SIZE; + + reduceQKBlockKernel::call( + q_vec_ptr, k_block_cache_ptr, head_block_logits, scale, + block_idx == block_num - 1 ? 
last_block_token_num : BLOCK_SIZE); + } + + std::pair max_and_sum; + if (alibi_slopes) { + max_and_sum = reduceSoftmaxAlibi( + logits, token_num, block_num * BLOCK_SIZE, + alibi_slopes[head_idx], start_token_idx, seq_len); + } else { + max_and_sum = + reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE); + } + + auto&& [max_logit, exp_sum] = max_and_sum; + + scalar_t* __restrict__ output_buffer = nullptr; + if (!no_reduce) { + auto idx = seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions + partition_idx; + max_logits[idx] = max_logit; + exp_sums[idx] = exp_sum; + output_buffer = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + partition_idx * HEAD_SIZE; + } else { + output_buffer = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; + } + + // Compute value + constexpr int head_elem_num_per_partition = 16; + constexpr int head_partition_num = + HEAD_SIZE / head_elem_num_per_partition; + for (int head_part_idx = 0; head_part_idx < head_partition_num; + ++head_part_idx) { + vec_op::FP32Vec16 accums[head_elem_num_per_partition]; + scalar_t* __restrict__ out_ptr = + output_buffer + head_part_idx * head_elem_num_per_partition; + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; + const float* __restrict__ prob_vec_ptr = + logits + block_idx * BLOCK_SIZE; + const scalar_t* __restrict__ v_block_cache_ptr = + v_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; + reduceValueBlock( + prob_vec_ptr, v_block_cache_ptr, accums); + + if (block_idx != block_num - 1) { + const int64_t next_physical_block_idx = + seq_block_table[block_idx + 1]; + const scalar_t* __restrict__ next_v_block_cache_ptr = + v_cache + next_physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; + vec_op::unroll_loop( + [&](int head_elem_idx) { + if (head_elem_idx % 2 == 0) { + vec_op::prefetch(next_v_block_cache_ptr + + BLOCK_SIZE * head_elem_idx); + } + }); + } + } + + vec_op::unroll_loop( + [&](int head_elem_idx) { + float value = accums[head_elem_idx].reduce_sum(); + vec_op::storeFP32(value, out_ptr + head_elem_idx); + }); + } + } + } + } + + // Rescale partition softmax and store the factors to exp_sums +#pragma omp parallel for collapse(2) schedule(static, 1) + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + const int seq_len = seq_lens[seq_idx]; + const int partition_num = + (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; + + if (partition_num == 1) continue; + + reducePartitionSoftmax( + max_logits + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions, + exp_sums + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions, + partition_num); + } + } + + // Reduce values + using v_load_vec_type = typename KernelVecType::v_load_vec_type; + static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE); + constexpr int head_elem_num_per_group = + 16; // Note: didn't align with the cacheline size, due to some + // HEAD_SIZE didn't align with 64 bytes + static_assert(HEAD_SIZE % head_elem_num_per_group == 0); + constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group; + const float* __restrict__ rescale_factors = exp_sums; +#pragma omp parallel for collapse(3) 
schedule(static, 1) + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + for (int group_idx = 0; group_idx < head_group_num; ++group_idx) { + const int seq_len = seq_lens[seq_idx]; + const int partition_num = + (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; + + if (partition_num == 1) continue; + + const float* __restrict__ seq_head_rescale_factors = + rescale_factors + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + const scalar_t* __restrict__ seq_head_tmp_out = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + group_idx * head_elem_num_per_group; + scalar_t* __restrict__ seq_head_output = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + + group_idx * head_elem_num_per_group; + + vec_op::FP32Vec16 acc; + for (int i = 0; i < partition_num; ++i) { + vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]); + v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE); + vec_op::FP32Vec16 fp32_value(value); + acc = acc + fp32_value * rescale_factor; + } + v_load_vec_type cast_acc(acc); + cast_acc.save(seq_head_output); + } + } + } + } +}; + +#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ + paged_attention_v2_impl::call( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ + key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, num_seqs, num_heads, \ + max_num_partitions); + +template +void paged_attention_v2_impl_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, + int max_seq_len, const std::optional& alibi_slopes) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + int max_num_partitions = exp_sums.size(-1); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + switch (head_size) { + case 32: + LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; + case 64: + LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); + break; + case 80: + LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); + break; + case 96: + LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); + break; + case 112: + LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); + break; + case 128: + LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); + break; + case 192: + LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); + break; + case 256: + LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ + paged_attention_v2_impl_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \ + alibi_slopes); + +#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ + switch (block_size) { \ + case 16: \ + CALL_V2_KERNEL_LAUNCHER(T, 16); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } +} // namespace + +void paged_attention_v2( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + TORCH_CHECK(blocksparse_vert_stride <= 1, + "CPU backend does not support blocksparse attention yet."); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", + [&] { + CPU_KERNEL_GUARD_IN(paged_attention_v2_impl) + CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); + CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) + }); +} \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cpu/cache.cpp b/vllm_v0.10.0/csrc/cpu/cache.cpp new file mode 100644 index 0000000..69f6d06 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/cache.cpp @@ -0,0 +1,214 @@ +#include +#include + +#include "cpu_types.hpp" + +#if defined(__x86_64__) + #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2 +#else + #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES +#endif + +namespace { +template +void copy_blocks_cpu_impl(std::vector const& key_caches, + std::vector const& value_caches, + const torch::Tensor& mapping_pairs, + const int element_num_per_block, + const int layer_num) { + const size_t pair_num = mapping_pairs.size(0); + const size_t block_bytes = sizeof(scalar_t) * element_num_per_block; +#pragma omp parallel for collapse(2) + for 
(int layer = 0; layer < layer_num; ++layer) { + for (size_t pair = 0; pair < pair_num; ++pair) { + int64_t source_offset = + element_num_per_block * mapping_pairs[pair][0].item(); + int64_t target_offset = + element_num_per_block * mapping_pairs[pair][1].item(); + scalar_t* key_cache_ptr = key_caches[layer].data_ptr(); + scalar_t* source_ptr = key_cache_ptr + source_offset; + scalar_t* target_ptr = key_cache_ptr + target_offset; + std::memcpy(target_ptr, source_ptr, block_bytes); + + scalar_t* value_cache_ptr = value_caches[layer].data_ptr(); + source_ptr = value_cache_ptr + source_offset; + target_ptr = value_cache_ptr + target_offset; + std::memcpy(target_ptr, source_ptr, block_bytes); + } + } +} + +template +void reshape_and_cache_cpu_impl( + const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, + scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache, + const int64_t* __restrict__ slot_mapping, const int num_tokens, + const int key_stride, const int value_stride, const int num_heads, + const int head_size, const int block_size, const int x) { + const int block_elem_num = num_heads * head_size * block_size; + +#pragma omp parallel for collapse(2) + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + const int64_t slot_idx = slot_mapping[token_idx]; + if (slot_idx >= 0) { + int src_key_head_idx = token_idx * key_stride + head_idx * head_size; + int src_value_head_idx = + token_idx * value_stride + head_idx * head_size; + const scalar_t* src_key_head_ptr = key + src_key_head_idx; + const scalar_t* src_value_head_ptr = value + src_value_head_idx; + const int64_t block_index = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + scalar_t* target_key_head_ptr = key_cache + + block_elem_num * block_index + + head_idx * block_size * head_size; + scalar_t* target_value_head_ptr = value_cache + + block_elem_num * block_index + + head_idx * block_size * head_size; + + for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) { + const int64_t target_offset = + src_key_idx * block_size + block_offset * x; + for (int i = 0; i < x; ++i) { + target_key_head_ptr[target_offset + i] = + src_key_head_ptr[src_key_idx + i]; + } + } + + for (int src_value_idx = 0; src_value_idx < head_size; + ++src_value_idx) { + const int64_t target_offset = + src_value_idx * block_size + block_offset; + target_value_head_ptr[target_offset] = + src_value_head_ptr[src_value_idx]; + } + } + } + } +} +}; // namespace + +template +void concat_and_cache_mla_cpu_impl( + const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] + const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] + scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // + pe_dim)] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int num_tokens, // + const int block_stride, // + const int entry_stride, // + const int kv_c_stride, // + const int k_pe_stride, // + const int kv_lora_rank, // + const int pe_dim, // + const int block_size // +) { +#pragma omp parallel for + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + continue; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + + auto copy = [&](const scalar_t* __restrict__ src, + scalar_t* __restrict__ dst, int 
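// A minimal sketch of the slot-mapping arithmetic used by reshape_and_cache_cpu_impl
// above: a flat slot index is split into (block_index, block_offset), and negative
// slots (padded tokens) are skipped. The struct and function names are illustrative.
#include <cstdint>

struct SlotPosition {
  int64_t block_index;   // which cache block the token lands in
  int64_t block_offset;  // position of the token inside that block
  bool valid;            // false for padded tokens (slot_idx < 0)
};

inline SlotPosition locate_slot(int64_t slot_idx, int block_size) {
  if (slot_idx < 0) return {0, 0, false};  // padded token: nothing is written
  return {slot_idx / block_size, slot_idx % block_size, true};
}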
src_stride, int dst_stride, + int size, int offset) { + for (int i = 0; i < size; i++) { + const int64_t src_idx = token_idx * src_stride + i; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; + dst[dst_idx] = src[src_idx]; + } + }; + + copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); + copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); + } +} + +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. +void copy_blocks(std::vector const& key_caches, + std::vector const& value_caches, + const torch::Tensor& block_mapping) { + unsigned num_layers = key_caches.size(); + TORCH_CHECK(num_layers == value_caches.size()); + if (num_layers == 0) { + return; + } + + const int element_num_per_block = key_caches[0][0].numel(); + DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) + copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, + element_num_per_block, num_layers); + CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) + }); +} + +void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, + torch::Tensor& key_cache, torch::Tensor& value_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale) { + int num_tokens = key.size(0); + int num_heads = key.size(1); + int head_size = key.size(2); + int block_size = key_cache.size(3); + int x = key_cache.size(4); + + int key_stride = key.stride(0); + int value_stride = value.stride(0); + + DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) + reshape_and_cache_cpu_impl( + key.data_ptr(), value.data_ptr(), + key_cache.data_ptr(), value_cache.data_ptr(), + slot_mapping.data_ptr(), num_tokens, key_stride, value_stride, + num_heads, head_size, block_size, x); + CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) + }); +} + +void concat_and_cache_mla( + torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] + torch::Tensor& k_pe, // [num_tokens, pe_dim] + torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + + // pe_dim)] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] + const std::string& kv_cache_dtype, torch::Tensor& scale) { + int num_tokens = slot_mapping.size(0); + int kv_lora_rank = kv_c.size(1); + int pe_dim = k_pe.size(1); + int block_size = kv_cache.size(1); + + TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + TORCH_CHECK(kv_cache_dtype != "fp8"); + + int kv_c_stride = kv_c.stride(0); + int k_pe_stride = k_pe.stride(0); + int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); + + VLLM_DISPATCH_FLOATING_TYPES( + kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl) + concat_and_cache_mla_cpu_impl( + kv_c.data_ptr(), k_pe.data_ptr(), + kv_cache.data_ptr(), slot_mapping.data_ptr(), + num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride, + kv_lora_rank, pe_dim, block_size); + CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl) + }); +} + +void swap_blocks(torch::Tensor& src, torch::Tensor& dst, + const torch::Tensor& block_mapping) { + TORCH_CHECK(false, "swap_blocks is unsupported on CPU.") +} diff --git a/vllm_v0.10.0/csrc/cpu/cpu_types.hpp b/vllm_v0.10.0/csrc/cpu/cpu_types.hpp new file mode 
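// A scalar sketch of the cache-entry layout written by concat_and_cache_mla above:
// each token's entry is kv_c (kv_lora_rank floats) followed by k_pe (pe_dim floats),
// stored at block_idx * block_stride + block_offset * entry_stride. Plain float
// pointers and the helper name write_mla_entry are used purely for illustration.
#include <cstdint>

inline void write_mla_entry(const float* kv_c, const float* k_pe, float* kv_cache,
                            int64_t block_idx, int64_t block_offset,
                            int64_t block_stride, int64_t entry_stride,
                            int kv_lora_rank, int pe_dim) {
  float* entry = kv_cache + block_idx * block_stride + block_offset * entry_stride;
  for (int i = 0; i < kv_lora_rank; ++i) entry[i] = kv_c[i];           // latent KV part
  for (int i = 0; i < pe_dim; ++i) entry[kv_lora_rank + i] = k_pe[i];  // RoPE part
}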
100644 index 0000000..17bbe04 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/cpu_types.hpp @@ -0,0 +1,20 @@ +#ifndef CPU_TYPES_HPP +#define CPU_TYPES_HPP + +#if defined(__x86_64__) + // x86 implementation + #include "cpu_types_x86.hpp" +#elif defined(__POWER9_VECTOR__) + // ppc implementation + #include "cpu_types_vsx.hpp" +#elif defined(__s390x__) + // s390 implementation + #include "cpu_types_vxe.hpp" +#elif defined(__aarch64__) + // arm implementation + #include "cpu_types_arm.hpp" +#else + #warning "unsupported vLLM cpu implementation" +#endif + +#endif \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cpu/cpu_types_arm.hpp b/vllm_v0.10.0/csrc/cpu/cpu_types_arm.hpp new file mode 100644 index 0000000..2251aac --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/cpu_types_arm.hpp @@ -0,0 +1,856 @@ +#include +#include +#include + +#if defined(__APPLE__) + #include "omp.h" +#endif + +namespace vec_op { + +#ifdef ARM_BF16_SUPPORT + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) +#endif + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline +// Number of elements in single ASIMD vector of given Datatype +#define NUM_ELEMENTS_REG(vec) (sizeof(vec) / sizeof(vec[0])) + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + float16x8_t reg; + + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; + + explicit FP16Vec8(const FP32Vec8&); + + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < 
remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. + + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } + } + } +}; + +#ifdef ARM_BF16_SUPPORT +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + bfloat16x8_t reg; + + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; + + explicit BF16Vec8(const FP32Vec8&); + + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + bfloat16x8x2_t reg; + + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; + + explicit BF16Vec16(const FP32Vec16&); + + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + for (int i = 0; i < full_blocks; i++) + vst1q_bf16( + reinterpret_cast<__bf16*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i, + reg.val[i]); + if (remainder 
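// The hand-unrolled switch above exists because vgetq_lane_f16/vgetq_lane_f32 require
// the lane index to be a compile-time constant (Clang enforces this on macOS). A
// sketch of the same tail store written with a compile-time unroll instead of a
// switch; for_each_lane and save_tail are assumed helper names, shown for float32x4_t.
#include <arm_neon.h>
#include <utility>

template <typename F, int... I>
constexpr void for_each_lane_impl(std::integer_sequence<int, I...>, F&& f) {
  (f(std::integral_constant<int, I>{}), ...);
}

template <int N, typename F>
constexpr void for_each_lane(F&& f) {
  for_each_lane_impl(std::make_integer_sequence<int, N>{}, std::forward<F>(f));
}

// Store only the first `remainder` lanes of a vector; the lane index is a constant
// expression inside the lambda body, which is what the intrinsic requires.
inline void save_tail(float32x4_t v, float* dst, int remainder) {
  for_each_lane<4>([&](auto lane) {
    constexpr int kLane = decltype(lane)::value;
    if (kLane < remainder) dst[kLane] = vgetq_lane_f32(v, kLane);
  });
}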
> 0) { + bfloat16x8_t temp = reg.val[full_blocks]; + bfloat16_t* base = reinterpret_cast(ptr) + full_blocks * 8; + if (remainder > 0) base[0] = vgetq_lane_bf16(temp, 0); + if (remainder > 1) base[1] = vgetq_lane_bf16(temp, 1); + if (remainder > 2) base[2] = vgetq_lane_bf16(temp, 2); + if (remainder > 3) base[3] = vgetq_lane_bf16(temp, 3); + if (remainder > 4) base[4] = vgetq_lane_bf16(temp, 4); + if (remainder > 5) base[5] = vgetq_lane_bf16(temp, 5); + if (remainder > 6) base[6] = vgetq_lane_bf16(temp, 6); + } + }; +}; + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + bfloat16x8x4_t reg; + + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; + + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + for (int i = 0; i < full_blocks; i++) + vst1q_bf16( + reinterpret_cast<__bf16*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i, + reg.val[i]); + if (remainder > 0) { + bfloat16x8_t temp = reg.val[full_blocks]; + bfloat16_t* base = reinterpret_cast(ptr) + full_blocks * 8; + base[0] = vgetq_lane_bf16(temp, 0); + if (remainder > 1) base[1] = vgetq_lane_bf16(temp, 1); + if (remainder > 2) base[2] = vgetq_lane_bf16(temp, 2); + if (remainder > 3) base[3] = vgetq_lane_bf16(temp, 3); + if (remainder > 4) base[4] = vgetq_lane_bf16(temp, 4); + if (remainder > 5) base[5] = vgetq_lane_bf16(temp, 5); + if (remainder > 6) base[6] = vgetq_lane_bf16(temp, 6); + } + }; +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + + union AliasReg { + float32x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4_t reg; + + explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {}; + + explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; + + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; + + explicit FP32Vec4(float32x4_t data) : reg(data) {}; + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + float32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x2_t reg; + + explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {}; + + explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; + + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + + explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; + + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; + + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; + + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + +#ifdef ARM_BF16_SUPPORT + + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + +#endif + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t 
exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])}; + float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])}; + float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])}; + float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1); + float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])}; + float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])}; + float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])}; + float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1); + float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), + static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), + static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), + static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), + static_cast(erf(ar.values[7]))}; + + float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); + float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float* ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + } +}; + +struct INT32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + int32x4x4_t reg; + int32_t values[VEC_ELEM_NUM]; + }; + int32x4x4_t reg; + + explicit INT32Vec16(const void* ptr) { + reg.val[0] = vld1q_s32(reinterpret_cast(ptr)); + reg.val[1] = vld1q_s32(reinterpret_cast(ptr) + 4); + reg.val[2] = vld1q_s32(reinterpret_cast(ptr) + 8); + reg.val[3] = vld1q_s32(reinterpret_cast(ptr) + 12); + } + + void save(int32_t* ptr) const { + vst1q_s32(ptr, reg.val[0]); + vst1q_s32(ptr + 4, reg.val[1]); + vst1q_s32(ptr + 8, reg.val[2]); + vst1q_s32(ptr + 12, reg.val[3]); + }; + + void save(int32_t* ptr, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + + for (int i = 0; i < full_blocks; i++) + vst1q_s32( + reinterpret_cast<__int32_t*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i, + reg.val[i]); + + if (remainder > 0) { + int32x4_t temp = reg.val[full_blocks]; + int32_t* base = reinterpret_cast(ptr) + full_blocks * 4; + if (remainder > 0) 
base[0] = vgetq_lane_s32(temp, 0); + if (remainder > 1) base[1] = vgetq_lane_s32(temp, 1); + if (remainder > 2) base[2] = vgetq_lane_s32(temp, 2); + if (remainder > 3) base[3] = vgetq_lane_s32(temp, 3); + } + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + float32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x4_t reg; + + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} + + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} + + explicit FP32Vec16(float32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} + + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} + +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif + + explicit FP32Vec16(const FP32Vec4& data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + }; + +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif + + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); + }; + explicit FP32Vec16(const INT32Vec16& v) { + reg.val[0] = vcvtq_f32_s32(v.reg.val[0]); + reg.val[1] = vcvtq_f32_s32(v.reg.val[1]); + reg.val[2] = vcvtq_f32_s32(v.reg.val[2]); + reg.val[3] = vcvtq_f32_s32(v.reg.val[3]); + }; + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const { + return FP32Vec16(float32x4x4_t( + {vminq_f32(max.reg.val[0], vmaxq_f32(min.reg.val[0], reg.val[0])), + vminq_f32(max.reg.val[1], vmaxq_f32(min.reg.val[1], reg.val[1])), + vminq_f32(max.reg.val[2], 
vmaxq_f32(min.reg.val[2], reg.val[2])), + vminq_f32(max.reg.val[3], vmaxq_f32(min.reg.val[3], reg.val[3]))})); + }; + + FP32Vec16 max(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmaxq_f32(b.reg.val[0], reg.val[0]), + vmaxq_f32(b.reg.val[1], reg.val[1]), + vmaxq_f32(b.reg.val[2], reg.val[2]), + vmaxq_f32(b.reg.val[3], reg.val[3])})); + }; + + FP32Vec16 max(const FP32Vec16& b, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + float32x4x4_t temp; + + for (int i = 0; i < full_blocks; i++) + temp.val[i] = vmaxq_f32(b.reg.val[i], reg.val[i]); + + if (remainder > 0) { + float max_v = std::max(vgetq_lane_f32(reg.val[full_blocks], 0), + vgetq_lane_f32(b.reg.val[full_blocks], 0)); + temp.val[full_blocks] = vsetq_lane_f32(max_v, temp.val[full_blocks], 0); + } + if (remainder > 1) { + float max_v = std::max(vgetq_lane_f32(reg.val[full_blocks], 1), + vgetq_lane_f32(b.reg.val[full_blocks], 1)); + temp.val[full_blocks] = vsetq_lane_f32(max_v, temp.val[full_blocks], 1); + } + if (remainder > 2) { + float max_v = std::max(vgetq_lane_f32(reg.val[full_blocks], 2), + vgetq_lane_f32(b.reg.val[full_blocks], 2)); + temp.val[full_blocks] = vsetq_lane_f32(max_v, temp.val[full_blocks], 2); + } + return FP32Vec16(temp); + }; + + FP32Vec16 min(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({ + vminq_f32(b.reg.val[0], reg.val[0]), + vminq_f32(b.reg.val[1], reg.val[1]), + vminq_f32(b.reg.val[2], reg.val[2]), + vminq_f32(b.reg.val[3], reg.val[3]), + })); + }; + FP32Vec16 min(const FP32Vec16& b, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + const int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + float32x4x4_t temp; + for (int i = 0; i < full_blocks; i++) + temp.val[i] = vminq_f32(b.reg.val[i], reg.val[i]); + + if (remainder > 0) { + float min_v = std::min(vgetq_lane_f32(reg.val[full_blocks], 0), + vgetq_lane_f32(b.reg.val[full_blocks], 0)); + temp.val[full_blocks] = vsetq_lane_f32(min_v, temp.val[full_blocks], 0); + } + if (remainder > 1) { + float min_v = std::min(vgetq_lane_f32(reg.val[full_blocks], 1), + vgetq_lane_f32(b.reg.val[full_blocks], 1)); + temp.val[full_blocks] = vsetq_lane_f32(min_v, temp.val[full_blocks], 1); + } + if (remainder > 2) { + float min_v = std::min(vgetq_lane_f32(reg.val[full_blocks], 2), + vgetq_lane_f32(b.reg.val[full_blocks], 2)); + temp.val[full_blocks] = vsetq_lane_f32(min_v, temp.val[full_blocks], 2); + } + + return FP32Vec16(temp); + }; + FP32Vec16 abs() const { + return FP32Vec16( + float32x4x4_t({vabsq_f32(reg.val[0]), vabsq_f32(reg.val[1]), + vabsq_f32(reg.val[2]), vabsq_f32(reg.val[3])})); + } + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + }; + + float reduce_max() const { + AliasReg ar; + ar.reg = reg; + float max_v = std::numeric_limits::lowest(); + unroll_loop( + [&max_v, &ar](int i) { max_v = std::max(max_v, ar.values[i]); }); + return max_v; + } + + float reduce_min() const { + AliasReg ar; + ar.reg = reg; + float min_v = std::numeric_limits::max(); + unroll_loop( + [&min_v, &ar](int i) { min_v = std::min(min_v, ar.values[i]); }); + return min_v; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float answer = 0; + const int start = idx * group_size; + unroll_loop( + [&answer, &start, ar](int 
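// Usage sketch for the FP32Vec16 reductions defined above: scan a float buffer in
// 16-wide chunks with reduce_max() and finish the tail with scalars. buffer_max is
// an assumed helper name; the vec_op types come from this header.
#include <algorithm>
#include <limits>

inline float buffer_max(const float* data, int n) {
  float result = std::numeric_limits<float>::lowest();
  int i = 0;
  for (; i + 16 <= n; i += 16) {
    vec_op::FP32Vec16 v(data + i);  // full 16-element load
    result = std::max(result, v.reduce_max());
  }
  for (; i < n; ++i) result = std::max(result, data[i]);  // scalar tail
  return result;
}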
i) { answer += ar.values[start + i]; }); + + return answer; + }; + + void save(float* ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + vst1q_f32(ptr + 8, reg.val[2]); + vst1q_f32(ptr + 12, reg.val[3]); + }; + + void save(float* ptr, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]); + int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]); + + for (int i = 0; i < full_blocks; i++) + vst1q_f32( + reinterpret_cast(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i, + reg.val[i]); + + if (remainder > 0) { + float32x4_t temp = reg.val[full_blocks]; + float* base = reinterpret_cast(ptr) + + full_blocks * NUM_ELEMENTS_REG(reg.val[0]); + if (remainder > 0) base[0] = vgetq_lane_f32(temp, 0); + if (remainder > 1) base[1] = vgetq_lane_f32(temp, 1); + if (remainder > 2) base[2] = vgetq_lane_f32(temp, 2); + } + } +}; + +struct INT8Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + int8x16_t reg; + int8_t values[VEC_ELEM_NUM]; + }; + int8x16_t reg; + + explicit INT8Vec16(const FP32Vec16& vec) { + // Convert each 128-bit float32 vector to int32 + int32x4_t part0 = + vcvtq_s32_f32(vec.reg.val[0]); // Convert first 128-bit block + int32x4_t part1 = + vcvtq_s32_f32(vec.reg.val[1]); // Convert second 128-bit block + int32x4_t part2 = + vcvtq_s32_f32(vec.reg.val[2]); // Convert third 128-bit block + int32x4_t part3 = + vcvtq_s32_f32(vec.reg.val[3]); // Convert fourth 128-bit block + + // Narrow each 32-bit vector to 8 bits and combine + int8x8_t lower = + vqmovn_s16(vcombine_s16(vqmovn_s32(part0), vqmovn_s32(part1))); + int8x8_t upper = + vqmovn_s16(vcombine_s16(vqmovn_s32(part2), vqmovn_s32(part3))); + reg = vcombine_s8(lower, upper); // Combine to form a single 128-bit vector + } + + void save(int8_t* ptr) const { vst1q_s8(ptr, reg); }; + + void save(int8_t* ptr, const int elem_num) const { + int full_blocks = elem_num / NUM_ELEMENTS_REG(reg); + int remainder = elem_num % NUM_ELEMENTS_REG(reg); + + for (int i = 0; i < full_blocks; i++) + vst1q_s8(reinterpret_cast(ptr) + NUM_ELEMENTS_REG(reg) * i, reg); + if (remainder > 0) { + int8x16_t temp = reg; + int8_t* base = + reinterpret_cast(ptr) + full_blocks * NUM_ELEMENTS_REG(reg); + if (remainder > 0) base[0] = vgetq_lane_s8(temp, 0); + if (remainder > 1) base[1] = vgetq_lane_s8(temp, 1); + if (remainder > 2) base[2] = vgetq_lane_s8(temp, 2); + if (remainder > 3) base[3] = vgetq_lane_s8(temp, 3); + if (remainder > 4) base[4] = vgetq_lane_s8(temp, 4); + if (remainder > 5) base[5] = vgetq_lane_s8(temp, 5); + if (remainder > 6) base[6] = vgetq_lane_s8(temp, 6); + if (remainder > 7) base[7] = vgetq_lane_s8(temp, 7); + if (remainder > 8) base[8] = vgetq_lane_s8(temp, 8); + if (remainder > 9) base[9] = vgetq_lane_s8(temp, 9); + if (remainder > 10) base[10] = vgetq_lane_s8(temp, 10); + if (remainder > 11) base[11] = vgetq_lane_s8(temp, 11); + if (remainder > 12) base[12] = vgetq_lane_s8(temp, 12); + if (remainder > 13) base[13] = vgetq_lane_s8(temp, 13); + if (remainder > 14) base[14] = vgetq_lane_s8(temp, 14); + } + }; +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = FP16Vec8; +}; + +#ifdef ARM_BF16_SUPPORT +template <> +struct VecType { + using vec_type = BF16Vec8; +}; +#endif + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +template <> +inline void storeFP32(float v, 
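// A scalar model of the FP32Vec16 -> INT8Vec16 conversion above: vcvtq_s32_f32
// truncates toward zero (NaN becomes 0) and the vqmovn narrowing chain saturates the
// result into int8. fp32_to_s8_saturating is an assumed name for this sketch.
#include <cmath>
#include <cstdint>

inline int8_t fp32_to_s8_saturating(float x) {
  if (std::isnan(x)) return 0;  // vcvtq_s32_f32 maps NaN to 0
  float t = std::trunc(x);      // round toward zero, like vcvtq_s32_f32
  if (t > 127.0f) return 127;   // saturate high, like the vqmovn chain
  if (t < -128.0f) return -128; // saturate low
  return static_cast<int8_t>(t);
}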
c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; +} + +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); + + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); +}; + +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); + + reg = vcombine_f16(lower_half, upper_half); +}; + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); +}; + +#ifdef ARM_BF16_SUPPORT +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); + float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); + float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); + float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1])); + + float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0])); + float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0])); + float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1])); + float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1])); + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); +}; +#endif + +#ifdef ARM_BF16_SUPPORT +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; +#endif + +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; + +#ifdef ARM_BF16_SUPPORT +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); +}; +#endif +}; // namespace vec_op \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cpu/cpu_types_vsx.hpp b/vllm_v0.10.0/csrc/cpu/cpu_types_vsx.hpp new file mode 100644 index 0000000..089b984 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/cpu_types_vsx.hpp @@ -0,0 +1,788 @@ + +#ifndef CPU_TYPES_VSX_HPP +#define CPU_TYPES_VSX_HPP + +#include +#include +#include +#include + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." 
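// Usage sketch for the fma() and FP32Vec16 helpers defined above: a simple dot
// product accumulated 16 lanes at a time. dot16 is an assumed name, and n is assumed
// to be a multiple of 16 to keep the example short.
inline float dot16(const float* a, const float* b, int n) {
  vec_op::FP32Vec16 acc;  // zero-initialised accumulator
  for (int i = 0; i + 16 <= n; i += 16) {
    vec_op::FP32Vec16 va(a + i);
    vec_op::FP32Vec16 vb(b + i);
    vec_op::fma(acc, va, vb);  // acc += va * vb, lane by lane
  }
  return acc.reduce_sum();
}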
<< std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +typedef struct i32x4x4_t { + __vector int32_t val[4]; +} i32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} + + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void* ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } + + void save(void* ptr, const int elem_num) const { + const int clamped_elem = std::max(0, std::min(elem_num, 16)); + + // Calculate elements to store in each 128-bit part (8 elements each) + const int elements_val0 = std::min(clamped_elem, 8); + const int elements_val1 = std::max(clamped_elem - 8, 0); + + // Convert elements to bytes (2 bytes per element) + const size_t bytes_val0 = elements_val0 * sizeof(signed short); + const size_t bytes_val1 = elements_val1 * sizeof(signed short); + + signed short* dest = static_cast(ptr); + // Store the first part using vec_xst_len + if (bytes_val0 > 0) { + vec_xst_len(reg.val[0], dest, bytes_val0); + } + // Store the second part if needed + if (bytes_val1 > 0) { + vec_xst_len(reg.val[1], dest + elements_val0, bytes_val1); + } + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + 
union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct INT32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + i32x4x4_t reg; + int32_t values[VEC_ELEM_NUM]; + }; + + i32x4x4_t reg; + + explicit INT32Vec16(const void* data_ptr) { + reg.val[0] = vec_xl(0, reinterpret_cast(data_ptr)); + reg.val[1] = + vec_xl(16, reinterpret_cast(data_ptr)); + reg.val[2] = + vec_xl(32, reinterpret_cast(data_ptr)); + reg.val[3] = + vec_xl(48, reinterpret_cast(data_ptr)); + } + + void 
save(int32_t* ptr) const { + vec_xst(reg.val[0], 0, reinterpret_cast<__vector int32_t*>(ptr)); + vec_xst(reg.val[1], 16, reinterpret_cast<__vector int32_t*>(ptr)); + vec_xst(reg.val[2], 32, reinterpret_cast<__vector int32_t*>(ptr)); + vec_xst(reg.val[3], 48, reinterpret_cast<__vector int32_t*>(ptr)); + } + + void save(int32_t* ptr, const int elem_num) const { + const int elements_in_chunk1 = + (elem_num >= 0) ? ((elem_num >= 4) ? 4 : elem_num) : 0; + const int elements_in_chunk2 = + (elem_num > 4) ? ((elem_num >= 8) ? 4 : elem_num - 4) : 0; + const int elements_in_chunk3 = + (elem_num > 8) ? ((elem_num >= 12) ? 4 : elem_num - 8) : 0; + const int elements_in_chunk4 = + (elem_num > 12) ? ((elem_num >= 16) ? 4 : elem_num - 12) : 0; + + const size_t bytes_chunk1 = + static_cast(elements_in_chunk1 * sizeof(int32_t)); + const size_t bytes_chunk2 = + static_cast(elements_in_chunk2 * sizeof(int32_t)); + const size_t bytes_chunk3 = + static_cast(elements_in_chunk3 * sizeof(int32_t)); + const size_t bytes_chunk4 = + static_cast(elements_in_chunk4 * sizeof(int32_t)); + + vec_xst_len(reg.val[0], reinterpret_cast(ptr), bytes_chunk1); + vec_xst_len(reg.val[1], + reinterpret_cast(reinterpret_cast(ptr) + 16), + bytes_chunk2); + vec_xst_len(reg.val[2], + reinterpret_cast(reinterpret_cast(ptr) + 32), + bytes_chunk3); + vec_xst_len(reg.val[3], + reinterpret_cast(reinterpret_cast(ptr) + 48), + bytes_chunk4); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4& data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + explicit FP32Vec16(const INT32Vec16& v) { + reg.val[0] = vec_ctf(v.reg.val[0], 0); + reg.val[1] = vec_ctf(v.reg.val[1], 0); + reg.val[2] = vec_ctf(v.reg.val[2], 0); + reg.val[3] = vec_ctf(v.reg.val[3], 0); + } + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return 
FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const { + return FP32Vec16(f32x4x4_t( + {vec_min(max.reg.val[0], vec_max(min.reg.val[0], reg.val[0])), + vec_min(max.reg.val[1], vec_max(min.reg.val[1], reg.val[1])), + vec_min(max.reg.val[2], vec_max(min.reg.val[2], reg.val[2])), + vec_min(max.reg.val[3], vec_max(min.reg.val[3], reg.val[3]))})); + } + + FP32Vec16 max(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]), + vec_max(reg.val[1], b.reg.val[1]), + vec_max(reg.val[2], b.reg.val[2]), + vec_max(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 max(const FP32Vec16& b, int elem_num) const { + FP32Vec16 result; + + // Create a vector of element indices for each chunk + __vector unsigned int indices = {0, 1, 2, 3}; + __vector unsigned int elem_num_vec = + vec_splats(static_cast(elem_num)); + + // Compute masks for each chunk + __vector unsigned int chunk_offset0 = {0, 0, 0, + 0}; // Chunk 0: Elements 0-3 + __vector unsigned int chunk_offset1 = {4, 4, 4, + 4}; // Chunk 1: Elements 4-7 + __vector unsigned int chunk_offset2 = {8, 8, 8, + 8}; // Chunk 2: Elements 8-11 + __vector unsigned int chunk_offset3 = {12, 12, 12, + 12}; // Chunk 3: Elements 12-15 + + // Compute masks for each chunk + __vector bool int mask0 = vec_cmplt(indices + chunk_offset0, elem_num_vec); + __vector bool int mask1 = vec_cmplt(indices + chunk_offset1, elem_num_vec); + __vector bool int mask2 = vec_cmplt(indices + chunk_offset2, elem_num_vec); + __vector bool int mask3 = vec_cmplt(indices + chunk_offset3, elem_num_vec); + + // Apply masks to compute the result for each chunk + result.reg.val[0] = vec_sel(this->reg.val[0], + vec_max(this->reg.val[0], b.reg.val[0]), mask0); + result.reg.val[1] = vec_sel(this->reg.val[1], + vec_max(this->reg.val[1], b.reg.val[1]), mask1); + result.reg.val[2] = vec_sel(this->reg.val[2], + vec_max(this->reg.val[2], b.reg.val[2]), mask2); + result.reg.val[3] = vec_sel(this->reg.val[3], + vec_max(this->reg.val[3], b.reg.val[3]), mask3); + + return FP32Vec16(result.reg); + } + + FP32Vec16 min(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_min(reg.val[0], b.reg.val[0]), + vec_min(reg.val[1], b.reg.val[1]), + vec_min(reg.val[2], b.reg.val[2]), + vec_min(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 min(const FP32Vec16& b, int elem_num) const { + FP32Vec16 result; + + vector unsigned int indices = {0, 1, 2, 3}; + vector unsigned int elem_num_vec = + vec_splats(static_cast(elem_num)); + + vector unsigned int chunk_offset0 = {0, 0, 0, 0}; + vector unsigned int chunk_offset1 = {4, 4, 4, 4}; + vector unsigned int chunk_offset2 = {8, 8, 8, 8}; + vector unsigned int chunk_offset3 = {12, 12, 12, 12}; + + vector bool int mask0 = vec_cmplt(indices + chunk_offset0, elem_num_vec); + vector bool int mask1 = vec_cmplt(indices + chunk_offset1, elem_num_vec); + vector bool int mask2 = vec_cmplt(indices + 
chunk_offset2, elem_num_vec); + vector bool int mask3 = vec_cmplt(indices + chunk_offset3, elem_num_vec); + + result.reg.val[0] = vec_sel(this->reg.val[0], + vec_min(this->reg.val[0], b.reg.val[0]), mask0); + result.reg.val[1] = vec_sel(this->reg.val[1], + vec_min(this->reg.val[1], b.reg.val[1]), mask1); + result.reg.val[2] = vec_sel(this->reg.val[2], + vec_min(this->reg.val[2], b.reg.val[2]), mask2); + result.reg.val[3] = vec_sel(this->reg.val[3], + vec_min(this->reg.val[3], b.reg.val[3]), mask3); + + return FP32Vec16(result.reg); + } + + FP32Vec16 abs() const { + return FP32Vec16(f32x4x4_t({vec_abs(reg.val[0]), vec_abs(reg.val[1]), + vec_abs(reg.val[2]), vec_abs(reg.val[3])})); + } + + float reduce_max() { + __vector float max01 = vec_max(reg.val[0], reg.val[1]); + __vector float max23 = vec_max(reg.val[2], reg.val[3]); + __vector float max_all = vec_max(max01, max23); + __vector float temp = vec_max(max_all, vec_sld(max_all, max_all, 8)); + temp = vec_max(temp, vec_sld(temp, temp, 4)); + return vec_extract(temp, 0); + } + + float reduce_min() { + __vector float min01 = vec_min(reg.val[0], reg.val[1]); + __vector float min23 = vec_min(reg.val[2], reg.val[3]); + __vector float min_all = vec_min(min01, min23); + __vector float temp = vec_min(min_all, vec_sld(min_all, min_all, 8)); + temp = vec_min(temp, vec_sld(temp, temp, 4)); + return vec_extract(temp, 0); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } + + void save(float* ptr, const int elem_num) const { + const int elements_in_chunk1 = + (elem_num >= 0) ? ((elem_num >= 4) ? 4 : elem_num) : 0; + const int elements_in_chunk2 = + (elem_num > 4) ? ((elem_num >= 8) ? 4 : elem_num - 4) : 0; + const int elements_in_chunk3 = + (elem_num > 8) ? ((elem_num >= 12) ? 4 : elem_num - 8) : 0; + const int elements_in_chunk4 = + (elem_num > 12) ? ((elem_num >= 16) ? 
4 : elem_num - 12) : 0; + + const size_t bytes_chunk1 = + static_cast(elements_in_chunk1 * sizeof(float)); + const size_t bytes_chunk2 = + static_cast(elements_in_chunk2 * sizeof(float)); + const size_t bytes_chunk3 = + static_cast(elements_in_chunk3 * sizeof(float)); + const size_t bytes_chunk4 = + static_cast(elements_in_chunk4 * sizeof(float)); + + vec_xst_len(reg.val[0], ptr, bytes_chunk1); + vec_xst_len(reg.val[1], + reinterpret_cast(reinterpret_cast(ptr) + 16), + bytes_chunk2); + vec_xst_len(reg.val[2], + reinterpret_cast(reinterpret_cast(ptr) + 32), + bytes_chunk3); + vec_xst_len(reg.val[3], + reinterpret_cast(reinterpret_cast(ptr) + 48), + bytes_chunk4); + } +}; + +struct INT8Vec16 : public Vec { + constexpr static int VEC_NUM_ELEM = 16; // 128 bits / 8 bits = 16 + + union AliasReg { + __vector signed char reg; + int8_t values[VEC_NUM_ELEM]; + }; + + __vector signed char reg; + + explicit INT8Vec16(const FP32Vec16& vec) { + __vector signed int ret[4]; + ret[0] = vec_cts(vec.reg.val[0], 0); + ret[1] = vec_cts(vec.reg.val[1], 0); + ret[2] = vec_cts(vec.reg.val[2], 0); + ret[3] = vec_cts(vec.reg.val[3], 0); + + __vector signed short packed1 = vec_packs(ret[0], ret[1]); + __vector signed short packed2 = vec_packs(ret[2], ret[3]); + + reg = vec_packs(packed1, packed2); + } + + void save(void* ptr) const { + *reinterpret_cast<__vector signed char*>(ptr) = reg; + } + void save(signed char* ptr, const int elem_num) { + vec_xst_len(reg, ptr, static_cast(elem_num)); + } +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN + #define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; +#ifndef _ARCH_PWR10 +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; +#endif + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[2]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + reg = vec_perm(ret[0], ret[1], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + 
vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +#endif +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[4]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[3]); + reg.val[0] = vec_perm(ret[0], ret[1], omask); + reg.val[1] = vec_perm(ret[2], ret[3], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + __vector unsigned int lsb2 = vec_sr(inp2, sh16); + __vector unsigned int lsb3 = vec_sr(inp3, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + lsb2 = vec_and(lsb2, one); + lsb3 = vec_and(lsb3, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + __vector unsigned int rnd2 = vec_add(lsb2, bias); + __vector unsigned int rnd3 = vec_add(lsb3, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + inp2 = vec_add(inp2, rnd2); + inp3 = vec_add(inp3, rnd3); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp2 = vec_sel(inp2, nan, sel2); + inp3 = vec_sel(inp3, nan, sel3); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + inp2 = vec_sr(inp2, sh16); + inp3 = vec_sr(inp3, sh16); + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +#endif +} + +inline void prefetch(const void* addr) { + __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); +} + +}; // namespace vec_op + +#endif diff --git a/vllm_v0.10.0/csrc/cpu/cpu_types_vxe.hpp b/vllm_v0.10.0/csrc/cpu/cpu_types_vxe.hpp new file mode 100644 index 0000000..ab8cbbb --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/cpu_types_vxe.hpp @@ -0,0 +1,480 @@ + +#ifndef CPU_TYPES_VXE_HPP +#define CPU_TYPES_VXE_HPP + +#include +#include +#include +namespace vec_op { + +#define vec_neg(a) (-(a)) +#define vec_add(a, b) ((a) + (b)) +#define vec_sub(a, b) ((a) - (b)) +#define vec_mul(a, b) ((a) * (b)) +#define vec_div(a, b) ((a) / (b)) +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
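// A scalar model of the Power9 FP32 -> BF16 path above: keep the top 16 bits of the
// float with round-to-nearest-even, and force NaN to a quiet-NaN pattern, matching
// the bias/lsb/nan vector constants used there. fp32_to_bf16_rne is an assumed name.
#include <cstdint>
#include <cstring>

inline uint16_t fp32_to_bf16_rne(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if ((bits & 0x7fffffffu) > 0x7f800000u) return 0x7fc0;  // NaN -> quiet NaN pattern
  uint32_t lsb = (bits >> 16) & 1u;                       // makes ties round to even
  bits += 0x7fffu + lsb;                                  // add the rounding bias
  return static_cast<uint16_t>(bits >> 16);
}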
\ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {} + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void* ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = 
vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + 
reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4& data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +namespace c10 { +struct BFloat16 { + uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit + // value. 
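  // The 16-bit payload is the upper half (sign, exponent, top 7 mantissa
  // bits) of an IEEE-754 binary32 value.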
+}; +} // namespace c10 + +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN + #define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel2 = + vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel3 = + vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + inp2 = vec_sel(inp2, nan, sel2) >> sh16; + inp3 = vec_sel(inp3, nan, sel3) >> sh16; + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +} + +inline void prefetch(const void* addr) { void __dcbt(const void* addr); } + +}; // namespace vec_op + +#endif \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cpu/cpu_types_x86.hpp b/vllm_v0.10.0/csrc/cpu/cpu_types_x86.hpp new file mode 100644 index 0000000..3952c43 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/cpu_types_x86.hpp @@ -0,0 +1,721 @@ + +#ifndef CPU_TYPES_X86_HPP +#define CPU_TYPES_X86_HPP + +#include +#include + +#ifndef __AVX2__ +static_assert(false, "AVX2 must be supported for the current implementation."); +#endif + +namespace vec_op { + +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) + +#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, \ + VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef({})); + #define CPU_KERNEL_GUARD_OUT(NAME) +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128i reg; + + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} + + explicit FP16Vec8(const FP32Vec8&); + + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + __m256i reg; + + // normal load + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} + + // non-temporal load + explicit FP16Vec16(bool, void* ptr) + : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + + explicit FP16Vec16(const FP32Vec16&); + + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } +}; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128i reg; + + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} + + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + __m256i reg; + + // normal load + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} + + // non-temporal load + explicit BF16Vec16(bool, void* ptr) + : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } +}; + +#ifdef __AVX512F__ +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m512i reg; + + explicit BF16Vec32() : reg(_mm512_setzero_si512()) {} + + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + + explicit BF16Vec32(__m512i data) : reg(data) {} + + explicit BF16Vec32(BF16Vec8& vec8_data) + : reg((__m512i)_mm512_inserti32x4( + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + (__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1), + (__m128i)vec8_data.reg, 2), + (__m128i)vec8_data.reg, 3)) {} + + void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } +}; +#else +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m256i reg_low; + __m256i reg_high; + + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + 
reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} + + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} + + explicit BF16Vec32(BF16Vec8& vec8_data) + : reg_low((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), + reg_high((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} + + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; + } +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __m128 reg; + float values[VEC_ELEM_NUM]; + }; + + __m128 reg; + + explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} + + explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} + + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) {} + + explicit FP32Vec4(__m128 data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + __m256 reg; + float values[VEC_ELEM_NUM]; + }; + + __m256 reg; + + explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} + + explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} + + explicit FP32Vec8(__m256 data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} + + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} + + explicit FP32Vec8(const BF16Vec8& v) + : reg(_mm256_castsi256_ps( + _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), + expf(ar.values[5]), expf(ar.values[4]), + expf(ar.values[3]), expf(ar.values[2]), + expf(ar.values[1]), expf(ar.values[0]))); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), + tanhf(ar.values[5]), tanhf(ar.values[4]), + tanhf(ar.values[3]), tanhf(ar.values[2]), + tanhf(ar.values[1]), tanhf(ar.values[0]))); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), + erf(ar.values[5]), erf(ar.values[4]), + erf(ar.values[3]), erf(ar.values[2]), + erf(ar.values[1]), erf(ar.values[0]))); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(_mm256_mul_ps(reg, b.reg)); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(_mm256_add_ps(reg, b.reg)); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(_mm256_sub_ps(reg, b.reg)); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(_mm256_div_ps(reg, b.reg)); + } + + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } +}; + +#ifdef __AVX512F__ +struct INT32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512i reg; + int32_t values[VEC_ELEM_NUM]; + }; + + __m512i reg; + + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } + + void save(int32_t* ptr, const int elem_num) const { + constexpr uint32_t M = 
0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm512_mask_storeu_epi32(ptr, mask, reg); + } +}; +#endif + +#ifdef __AVX512F__ +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512 reg; + float values[VEC_ELEM_NUM]; + }; + + __m512 reg; + + explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} + + explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + + // normal load + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} + + // non-temporal load + explicit FP32Vec16(bool, void* ptr) + : reg((__m512)_mm512_stream_load_si512(ptr)) {} + + explicit FP32Vec16(__m512 data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec4& data) + : reg((__m512)_mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), + (__m128i)data.reg, 1), + (__m128i)data.reg, 2), + (__m128i)data.reg, 3)) {} + + explicit FP32Vec16(const FP32Vec8& data) + : reg((__m512)_mm512_inserti32x8( + _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} + + explicit FP32Vec16(const BF16Vec16& v) + : reg(_mm512_castsi512_ps( + _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} + + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(_mm512_mul_ps(reg, b.reg)); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(_mm512_add_ps(reg, b.reg)); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(_mm512_sub_ps(reg, b.reg)); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(_mm512_div_ps(reg, b.reg)); + } + + FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const { + return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg))); + } + + FP32Vec16 max(const FP32Vec16& b) const { + return FP32Vec16(_mm512_max_ps(reg, b.reg)); + } + + FP32Vec16 max(const FP32Vec16& b, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg)); + } + + FP32Vec16 min(const FP32Vec16& b) const { + return FP32Vec16(_mm512_min_ps(reg, b.reg)); + } + + FP32Vec16 min(const FP32Vec16& b, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); + } + + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } + + float reduce_sum() const { return _mm512_reduce_add_ps(reg); } + + float reduce_max() const { return _mm512_reduce_max_ps(reg); } + + float reduce_min() const { return _mm512_reduce_min_ps(reg); } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); + return _mm512_mask_reduce_add_ps(mask, reg); + } + + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } + + void save(float* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + 
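    // e.g. elem_num == 5: 0xFFFFFFFF >> 27 == 0x1f, so only lanes 0..4 are
    // written; elem_num == 16 keeps all 16 lanes.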
_mm512_mask_storeu_ps(ptr, mask, reg); + } +}; +#else +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + union AliasReg { + __m256 reg; + float values[8]; + }; + + __m256 reg_low; + __m256 reg_high; + + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} + + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {} + + explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} + + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} + + explicit FP32Vec16(const FP32Vec4& data) + : reg_low((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), + reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} + + explicit FP32Vec16(const FP32Vec8& data) + : reg_low(data.reg), reg_high(data.reg) {} + + explicit FP32Vec16(const FP16Vec16& v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + reg_low = _mm256_cvtph_ps(low); + reg_high = _mm256_cvtph_ps(high); + } + + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + explicit FP32Vec16(const BF16Vec16& v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); + __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); + + __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); + __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); + + reg_low = _mm256_castsi256_ps(v_low_shifted); + reg_high = _mm256_castsi256_ps(v_high_shifted); + } + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), + _mm256_mul_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), + _mm256_add_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), + _mm256_sub_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), + _mm256_div_ps(reg_high, b.reg_high)); + } + + float reduce_sum() const { + FP32Vec8 low = FP32Vec8(reg_low); + FP32Vec8 high = FP32Vec8(reg_high); + return low.reduce_sum() + high.reduce_sum(); + } + + template + float reduce_sub_sum(int idx) { + float sum = 0.0; + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + uint32_t mask = base_mask << (idx * group_size); + + AliasReg ar; + + auto func = [&sum, &mask, &ar](int i) { + int flag = mask & 0x1; + mask = mask >> 1; + if (flag != 0) sum += ar.values[i]; + }; + + ar.reg = reg_low; + unroll_loop(func); + + ar.reg = reg_high; + unroll_loop(func); + + return sum; + } + + void save(float* ptr) const { + _mm256_storeu_ps(ptr, reg_low); + _mm256_storeu_ps(ptr + 8, reg_high); + } +}; +#endif + +#ifdef __AVX512F__ +struct INT8Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m128i reg; + int8_t values[VEC_ELEM_NUM]; + }; + + __m128i reg; + + explicit INT8Vec16(const FP32Vec16& vec) + : 
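      // Rounds each FP32 lane to the nearest int32, then narrows to int8 by
      // truncation (vpmovdb, not a saturating pack); the int8 quantization
      // kernels in quant.cpp clamp to [-128, 127] before building this vector.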
reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } + + void save(int8_t* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm_mask_storeu_epi8(ptr, mask, reg); + } +}; + +struct INT8Vec64 : public Vec { + constexpr static int VEC_ELEM_NUM = 64; + union AliasReg { + __m512i reg; + int8_t values[VEC_ELEM_NUM]; + }; + + __m512i reg; + + // normal load + explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} + + // non-temporal load + explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} + + void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } + + void save(int8_t* ptr, const int elem_num) const { + constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF; + __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num)); + _mm512_mask_storeu_epi8(ptr, mask, reg); + } + + // non-temporal save + void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } +}; +#endif + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = FP16Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast(ptr) = + _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) + : reg(_mm256_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + +#ifdef __AVX512F__ +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm512_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} +#else +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +#endif + +#ifdef __AVX512BF16__ +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} + +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { + acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); +} +#else +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(_mm256_cvtepi32_epi16( + _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg(_mm512_cvtepi32_epi16( + _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} + #else +namespace { +__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { + __m256i ai = _mm256_castps_si256(a); + ai = _mm256_srli_epi32(ai, 16); + ai = _mm256_packus_epi32(ai, ai); + ai = _mm256_permute4x64_epi64(ai, 0b00111001); + return _mm256_extracti128_si256(ai, 0); +} +} // namespace + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : 
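    // AVX2 fallback: keeps the upper 16 bits of each FP32 lane (plain
    // truncation, unlike the round-to-nearest conversion used on the
    // __AVX512BF16__ path above).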
reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { + BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); + BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); + reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); +} + #endif // __AVX512F__ +#endif // __AVX512BF16__ + +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } + +#ifdef __AVX512F__ +inline void non_temporal_save(FP16Vec16& vec, void* ptr) { + _mm256_stream_si256((__m256i*)ptr, vec.reg); +} +inline void non_temporal_save(BF16Vec32& vec, void* ptr) { + _mm512_stream_si512((__m512i*)ptr, vec.reg); +} +inline void non_temporal_save(BF16Vec16& vec, void* ptr) { + _mm256_stream_si256((__m256i*)ptr, vec.reg); +} +inline void non_temporal_save(FP32Vec16& vec, void* ptr) { + _mm512_stream_ps((float*)ptr, vec.reg); +} +#endif + +inline void mem_barrier() { _mm_mfence(); } +}; // namespace vec_op + +#endif diff --git a/vllm_v0.10.0/csrc/cpu/dnnl_helper.hpp b/vllm_v0.10.0/csrc/cpu/dnnl_helper.hpp new file mode 100644 index 0000000..1cb8dc5 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/dnnl_helper.hpp @@ -0,0 +1,206 @@ +#ifndef DNNL_HELPER_HPP +#define DNNL_HELPER_HPP + +#include +#include + +#include "oneapi/dnnl/dnnl.hpp" + +namespace { +template +struct DNNLType { + static constexpr dnnl::memory::data_type type = + dnnl::memory::data_type::undef; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + +template +constexpr inline dnnl::memory::data_type get_dnnl_type() { + return DNNLType>::type; +} +}; // namespace + +template +class DNNLPrimitiveHelper { + public: + // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) + // A: [M, K], row-major + // B: [K, N], column-major + // C: [M, N], row-major + // bias: [N], row-major, optional + // a_scales: [MS] + // b_scales: [NS] + // Note: Due to the limitation of oneDNN + // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is + // not supported. + + template + static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, + const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, + dnnl_dim_t K, const float* a_scales, + const float* b_scales, dnnl_dim_t MS, + dnnl_dim_t NS) { + auto&& OutputType = get_dnnl_type(); + auto&& BiasType = get_dnnl_type(); + + dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); + dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); + dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); + + dnnl::primitive_attr attr; + if constexpr (!InputNoScale) { + if (MS == 1) { + // per-tensor + attr.set_scales_mask(DNNL_ARG_SRC, 0); + } else { + // per-token + TORCH_CHECK(false, "per-token quantization is unsupported."); + } + } + + if (NS == 1) { + // per-tensor + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); + } else { + // per-channel + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); + } + + dnnl::matmul::primitive_desc matmul_pd; +// Create memory descriptors with format_tag::any for the primitive. 
This +// enables the matmul primitive to choose memory layouts for an +// optimized primitive implementation, and these layouts may differ from the +// ones provided by the user. +#ifdef __aarch64__ + auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8, + dnnl::memory::format_tag::any); + auto mat_weights_md = dnnl::memory::desc( + {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any); + auto mat_dst_md = + dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any); + if (bias) { + dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); + matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md, + mat_weights_md, bias_md, + mat_dst_md, attr); + } else { + matmul_pd = dnnl::matmul::primitive_desc( + default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr); + } +#else + if (bias) { + dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); + matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, + bias_md, c_md, attr); + } else { + matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, + c_md, attr); + } +#endif + dnnl::matmul matmul(matmul_pd); + + auto& engine = default_engine(); + + dnnl::memory a_m(a_md, engine, (void*)a); + dnnl::memory b_m(b_md, engine, (void*)b); + dnnl::memory c_m(c_md, engine, (void*)c); + dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, + (void*)a_scales); + dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, + (void*)b_scales); + + auto& stream = default_stream(); + + auto mat_src_mem = a_m; + auto mat_weights_mem = b_m; + auto mat_dst_mem = c_m; +#ifdef __aarch64__ + if (matmul_pd.weights_desc() != b_m.get_desc()) { + mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine); + dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem); + } +#endif + if constexpr (InputNoScale) { + if (bias) { + dnnl::memory::desc bias_md({N}, BiasType, {1}); + dnnl::memory bias_m(bias_md, engine, (void*)bias); + matmul.execute( + stream, { + {DNNL_ARG_SRC, mat_src_mem}, + {DNNL_ARG_WEIGHTS, mat_weights_mem}, + {DNNL_ARG_BIAS, bias_m}, + {DNNL_ARG_DST, mat_dst_mem}, + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, + }); + } else { + matmul.execute( + stream, { + {DNNL_ARG_SRC, mat_src_mem}, + {DNNL_ARG_WEIGHTS, mat_weights_mem}, + {DNNL_ARG_DST, mat_dst_mem}, + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, + }); + } + } else { + if (bias) { + dnnl::memory::desc bias_md({N}, BiasType, {1}); + dnnl::memory bias_m(bias_md, engine, (void*)bias); + matmul.execute( + stream, { + {DNNL_ARG_SRC, mat_src_mem}, + {DNNL_ARG_WEIGHTS, mat_weights_mem}, + {DNNL_ARG_BIAS, bias_m}, + {DNNL_ARG_DST, mat_dst_mem}, + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, + }); + } else { + matmul.execute( + stream, { + {DNNL_ARG_SRC, mat_src_mem}, + {DNNL_ARG_WEIGHTS, mat_weights_mem}, + {DNNL_ARG_DST, mat_dst_mem}, + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, + }); + } + } + stream.wait(); + } + + private: + static dnnl::engine& default_engine() { + static dnnl::engine engine(dnnl::engine::kind::cpu, 0); + return engine; + } + + static dnnl::stream& default_stream() { + static dnnl::stream stream(default_engine()); + return stream; + } +}; +#endif diff --git a/vllm_v0.10.0/csrc/cpu/layernorm.cpp b/vllm_v0.10.0/csrc/cpu/layernorm.cpp new file mode 100644 index 0000000..a76ad08 --- /dev/null +++ 
b/vllm_v0.10.0/csrc/cpu/layernorm.cpp @@ -0,0 +1,117 @@ +#include "cpu_types.hpp" + +namespace { +template +void rms_norm_impl(scalar_t* __restrict__ out, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ weight, const float epsilon, + const int num_tokens, const int hidden_size) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0); + +#pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + vec_op::FP32Vec8 variance(0.0); + auto input_p = input + i * hidden_size; + auto output_p = out + i * hidden_size; + for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) { + scalar_vec_t x(input_p + j); + vec_op::FP32Vec8 fp32_x(x); + variance = variance + fp32_x * fp32_x; + } + + float s_variance = + 1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon); + vec_op::FP32Vec8 fp32_s_variance(s_variance); + + for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) { + scalar_vec_t x(input_p + j); + scalar_vec_t w(weight + j); + + vec_op::FP32Vec8 fp32_x(x); + vec_op::FP32Vec8 fp32_w(w); + + vec_op::FP32Vec8 fp32_out = fp32_x * fp32_s_variance * fp32_w; + + scalar_vec_t out(fp32_out); + out.save(output_p + j); + } + } +} + +template +void fused_add_rms_norm_impl(scalar_t* __restrict__ input, + scalar_t* __restrict__ residual, + const scalar_t* __restrict__ weight, + const float epsilon, const int num_tokens, + const int hidden_size) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0); + +#pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + vec_op::FP32Vec8 variance(0.0); + auto input_p = input + i * hidden_size; + auto residual_p = residual + i * hidden_size; + for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) { + scalar_vec_t x(input_p + j); + scalar_vec_t res(residual_p + j); + vec_op::FP32Vec8 fp32_x(x); + vec_op::FP32Vec8 fp32_res(res); + + fp32_x = fp32_x + fp32_res; + variance = variance + fp32_x * fp32_x; + scalar_vec_t out(fp32_x); + out.save(residual_p + j); + } + + float s_variance = + 1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon); + vec_op::FP32Vec8 fp32_s_variance(s_variance); + + for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) { + scalar_vec_t w(weight + j); + scalar_vec_t res(residual_p + j); + + vec_op::FP32Vec8 fp32_w(w); + vec_op::FP32Vec8 fp32_res(res); + + vec_op::FP32Vec8 fp32_out = fp32_res * fp32_s_variance * fp32_w; + + scalar_vec_t out(fp32_out); + out.save(input_p + j); + } + } +} +} // namespace + +void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] { + CPU_KERNEL_GUARD_IN(rms_norm_impl) + rms_norm_impl(out.data_ptr(), input.data_ptr(), + weight.data_ptr(), epsilon, num_tokens, + hidden_size); + CPU_KERNEL_GUARD_OUT(rms_norm_impl) + }); +} + +void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, + torch::Tensor& weight, double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "fused_add_rms_norm_impl", [&] { + CPU_KERNEL_GUARD_IN(fused_add_rms_norm_impl) + fused_add_rms_norm_impl( + input.data_ptr(), residual.data_ptr(), + weight.data_ptr(), epsilon, num_tokens, hidden_size); + 
CPU_KERNEL_GUARD_OUT(fused_add_rms_norm_impl) + }); +} diff --git a/vllm_v0.10.0/csrc/cpu/mla_decode.cpp b/vllm_v0.10.0/csrc/cpu/mla_decode.cpp new file mode 100644 index 0000000..37bd463 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/mla_decode.cpp @@ -0,0 +1,393 @@ +#include "cpu_types.hpp" +#include + +namespace { +template +struct KernelVecType { + using qk_load_vec_type = void; + using qk_vec_type = void; + using v_load_vec_type = void; +}; + +template <> +struct KernelVecType { + using qk_load_vec_type = vec_op::FP32Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power and s390x architecture-specific vector types + using qk_load_vec_type = vec_op::FP32Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures, including x86 + using qk_load_vec_type = vec_op::FP16Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +#endif +}; + +#ifdef __AVX512BF16__ +template <> +struct KernelVecType { + using qk_load_vec_type = vec_op::BF16Vec32; + using qk_vec_type = vec_op::BF16Vec32; + using v_load_vec_type = vec_op::BF16Vec16; +}; +#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT) +// pass +#else +template <> +struct KernelVecType { + using qk_load_vec_type = vec_op::BF16Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; +#endif + +template +void mla_decode_block_head( + const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim] + const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim] + const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim] + float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim] + float* __restrict__ acc_lse, // [HEAD_UNROLL] + const float scale, const int num_tokens) { + using f32_vec_type = vec_op::FP32Vec16; + constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM; + constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM; + + float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros + float max_val[HEAD_UNROLL]; + std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX); + + f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL]; + for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) { + // load to registers + qk_vec_type q_vec[HEAD_UNROLL]; + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + q_vec[unroll] = + qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]}; + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { + qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]); + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec); + } + } + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale; + logits[block_offset][unroll] = acc; + max_val[unroll] = std::max(max_val[unroll], acc); + } + } + + float sum_exp[HEAD_UNROLL] = {}; + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + const float val = + std::exp(logits[block_offset][unroll] - max_val[unroll]); + logits[block_offset][unroll] = val; + sum_exp[unroll] += val; + } + } + + 
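  // Here logits[block_offset][unroll] holds exp(score - max_val[unroll]) and
  // sum_exp[unroll] its sum over this block's tokens, so logits / sum_exp
  // below are the softmax weights applied to the V rows.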
f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL]; + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { + // load to registers + f32_vec_type scale_[HEAD_UNROLL]; + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + scale_[unroll] = + f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]}; + + for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) { + f32_vec_type v_vec( + v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]); + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]); + } + } + + // merge attention state + // section 2.2 in https://arxiv.org/pdf/2501.01005 + f32_vec_type prev_scale[HEAD_UNROLL]; + f32_vec_type curr_scale[HEAD_UNROLL]; + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + const float prev_lse = acc_lse[unroll]; + const float curr_lse = std::log(sum_exp[unroll]) + + max_val[unroll]; // add back max_val to get true lse + // softmax trick + const float max_lse = std::max(prev_lse, curr_lse); + const float prev_sum_exp = std::exp(prev_lse - max_lse); + const float curr_sum_exp = std::exp(curr_lse - max_lse); + + const float new_sum_exp = prev_sum_exp + curr_sum_exp; + acc_lse[unroll] = std::log(new_sum_exp) + max_lse; + + prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp}; + curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp}; + } + + for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) { +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll); + o_vec = o_vec * prev_scale[unroll] + + this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll]; + o_vec.save(acc_out + i + V_HEAD_DIM * unroll); + } + } + + q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL; + acc_out += V_HEAD_DIM * HEAD_UNROLL; +} + +template +void mla_decode_block( + const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim] + const scalar_t* __restrict__ kv_cache, // [block_size, head_dim] + float* __restrict__ acc_out, // [num_heads, v_head_dim] + float* __restrict__ acc_lse, // [num_heads] + const int num_heads, const float scale, const int num_tokens) { + using qk_load_vec_type = typename KernelVecType::qk_load_vec_type; + static_assert( + std::is_same::qk_vec_type>::value); + using v_load_vec_type = typename KernelVecType::v_load_vec_type; + using f32_vec_type = vec_op::FP32Vec16; + static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM); + static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM); + constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM; + constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM; + + const qk_vec_type* k_vecs; + const f32_vec_type* v_vecs_f32; + float* kv_cache_f32 = nullptr; + + if constexpr (!std::is_same::value) { + // convert KV cache block to FP32 to reuse it across query heads and + // attn @ V computation, since FP16/BF16->FP32 is expensive. + // TODO: move malloc outside of this fn to reuse across iterations. 
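    // 64-byte-aligned scratch covering the whole block
    // (BLOCK_SIZE x HEAD_DIM floats); freed at the end of this call.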
+ const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float); + kv_cache_f32 = static_cast(std::aligned_alloc(64, nbytes)); + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) + for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) { + v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i); + f32_vec_type kv_vec_f32(kv_load_vec); + kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i); + } + + if constexpr (std::is_same::value) { + // for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion) + // NOTE: in this case, we only need to convert the V section to FP32. + // But for simplicity, we will convert the whole KV block to FP32. + k_vecs = reinterpret_cast(kv_cache); + } else { + k_vecs = reinterpret_cast(kv_cache_f32); + } + + // attn @ V always use FP32 for V, since attn is FP32. + v_vecs_f32 = reinterpret_cast(kv_cache_f32); + + } else { + // KV cache is FP32. don't need to do anything. + k_vecs = reinterpret_cast(kv_cache); + v_vecs_f32 = reinterpret_cast(kv_cache); + } + + // compute 2 heads at the same time to improve ILP and + // take advantage of register cache for K and V. + constexpr int HEAD_UNROLL = 2; + for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) { + mla_decode_block_head( + q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens); + + q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM; + acc_out += HEAD_UNROLL * V_HEAD_DIM; + acc_lse += HEAD_UNROLL; + } + + // take care of the remaining heads + for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) { + mla_decode_block_head( + q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens); + + q_vecs += HEAD_DIM / QK_NUM_ELEM; + acc_out += V_HEAD_DIM; + acc_lse += 1; + } + + if (kv_cache_f32 != nullptr) { + std::free(kv_cache_f32); + } +} +} // namespace + +template +void mla_decode_kvcache_cpu_impl( + scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim] + const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, + // head_dim] + const int num_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, const int o_stride, const int q_stride, + const int kv_stride, const int num_seqs) { + using qk_load_vec_type = typename KernelVecType::qk_load_vec_type; + using qk_vec_type = typename KernelVecType::qk_vec_type; + constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM; + + // shared across threads + const int max_threads = omp_get_max_threads(); + const int acc_out_nbytes = + max_threads * num_heads * V_HEAD_DIM * sizeof(float); + float* acc_out = static_cast(std::aligned_alloc(64, acc_out_nbytes)); + std::vector acc_lse(max_threads * num_heads); + + // allocate memory to pre-convert query to FP32 later + float* q_f32; + constexpr bool PRE_CONVERT_QUERY = + !std::is_same::value && + std::is_same::value; + if constexpr (PRE_CONVERT_QUERY) { + const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float); + q_f32 = static_cast(std::aligned_alloc(64, q_f32_nbytes)); + } + +#pragma omp parallel + { + const int num_threads = omp_get_num_threads(); + const int thread_id = omp_get_thread_num(); + float* __restrict__ acc_out_thread = + acc_out + thread_id * num_heads * V_HEAD_DIM; + float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads; + + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + // reset accumulator + 
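      // (each thread owns one [num_heads, V_HEAD_DIM] output slab and one LSE
      // entry per head; per-thread partials are merged after the block loop
      // below)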
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f); + std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX); + + const int seq_len = seq_lens[seq_idx]; + const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE; + + const qk_vec_type* q_vecs; + if constexpr (PRE_CONVERT_QUERY) { +// pre-convert query to FP32 since FP16/BF16->FP32 is slow. +#pragma omp for + for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) { + qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i); + qk_vec_type q_vec(q_load_vec); + q_vec.save(q_f32 + i); + } + q_vecs = reinterpret_cast(q_f32); + } else { + q_vecs = reinterpret_cast(q + seq_idx * q_stride); + } + +#pragma omp for + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int physical_block_idx = + block_tables[seq_idx * max_num_blocks_per_seq + block_idx]; + const int num_tokens = + block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size; + + mla_decode_block( + q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread, + acc_lse_thread, num_heads, scale, num_tokens); + } + +// merge attention states across threads +// section 2.2 in https://arxiv.org/pdf/2501.01005 +// each thread is responsible for 1 head +#pragma omp for + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + float* acc_lse_head = acc_lse.data() + head_idx; + float* acc_out_head = acc_out + head_idx * V_HEAD_DIM; + + float max_val = -FLT_MAX; + for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) { + max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]); + } + + float sum_exp = 0.0f; + for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) { + float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val); + acc_lse_head[thread_id_ * num_heads] = val; + sum_exp += val; + } + + float inv_sum = 1.0f / sum_exp; + float out_head[V_HEAD_DIM] = {}; + for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) { + float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum; + for (int i = 0; i < V_HEAD_DIM; ++i) { + out_head[i] += + acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_; + } + } + + for (int i = 0; i < V_HEAD_DIM; ++i) { + vec_op::storeFP32(out_head[i], out + seq_idx * o_stride + + head_idx * V_HEAD_DIM + i); + } + } + } + } + if (PRE_CONVERT_QUERY) { + std::free(q_f32); + } + std::free(acc_out); +} + +void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, + torch::Tensor& kv_cache, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens) { + const int num_seqs = query.size(0); + const int num_heads = query.size(1); + const int head_dim = query.size(2); + const int block_size = kv_cache.size(1); + const int v_head_dim = out.size(2); + + const int max_num_blocks_per_seq = block_tables.size(1); + const int o_stride = out.stride(0); + const int q_stride = query.stride(0); + const int kv_stride = kv_cache.stride(0); + + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl) + if (head_dim == 576 && v_head_dim == 512 && block_size == 16) + mla_decode_kvcache_cpu_impl( + out.data_ptr(), query.data_ptr(), + kv_cache.data_ptr(), num_heads, scale, + block_tables.data_ptr(), seq_lens.data_ptr(), + max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs); + else + TORCH_CHECK(false, "Unsupported block size: ", block_size); + CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl) 
+ }); +} \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cpu/pos_encoding.cpp b/vllm_v0.10.0/csrc/cpu/pos_encoding.cpp new file mode 100644 index 0000000..74bb014 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/pos_encoding.cpp @@ -0,0 +1,208 @@ + +#include "cpu_types.hpp" + +namespace { +template +void rotary_embedding_impl( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads, + /// head_size] or [num_tokens, num_heads, + /// head_size] + scalar_t* __restrict__ key, // nullptr (optional) or + // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int num_heads, const int num_kv_heads, const int head_size, + const int num_tokens) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + + const int embed_dim = rot_dim / 2; + bool flag = (embed_dim % VEC_ELEM_NUM == 0); + const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM; + + auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr, + scalar_t* qk) { + int j = 0; + for (; j < loop_upper; j += VEC_ELEM_NUM) { + const int rot_offset = j; + const int x_index = rot_offset; + const int y_index = embed_dim + rot_offset; + + const int64_t out_x = token_head + x_index; + const int64_t out_y = token_head + y_index; + + const scalar_vec_t cos(cache_ptr + x_index); + const scalar_vec_t sin(cache_ptr + y_index); + + const scalar_vec_t q_x(qk + out_x); + const scalar_vec_t q_y(qk + out_y); + + vec_op::FP32Vec8 fp32_cos(cos); + vec_op::FP32Vec8 fp32_sin(sin); + + vec_op::FP32Vec8 fp32_q_x(q_x); + vec_op::FP32Vec8 fp32_q_y(q_y); + + auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; + scalar_vec_t(out1).save(qk + out_x); + + auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; + scalar_vec_t(out2).save(qk + out_y); + } + if (!flag) { + for (; j < embed_dim; ++j) { + const int x_index = j; + const int y_index = embed_dim + j; + + const int64_t out_x = token_head + x_index; + const int64_t out_y = token_head + y_index; + + const float fp32_cos = cache_ptr[x_index]; + const float fp32_sin = cache_ptr[y_index]; + + const float fp32_q_x = qk[out_x]; + const float fp32_q_y = qk[out_y]; + + qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; + qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; + } + } + }; + +#pragma omp parallel for + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + + for (int i = 0; i < num_heads; ++i) { + const int head_idx = i; + const int64_t token_head = + token_idx * query_stride + head_idx * head_size; + compute_loop(token_head, cache_ptr, query); + } + + if (key != nullptr) { + for (int i = 0; i < num_kv_heads; ++i) { + const int head_idx = i; + const int64_t token_head = + token_idx * key_stride + head_idx * head_size; + compute_loop(token_head, cache_ptr, key); + } + } + } +} + +template +void rotary_embedding_gptj_impl( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads, + /// head_size] or [num_tokens, num_heads, + /// head_size] + scalar_t* __restrict__ key, // nullptr (optional) or + // [batch_size, 
seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int num_heads, const int num_kv_heads, const int head_size, + const int num_tokens) { + const int embed_dim = rot_dim / 2; + +#pragma omp parallel for collapse(2) + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + for (int i = 0; i < num_heads; ++i) { + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + const scalar_t* cos_cache_ptr = cache_ptr; + const scalar_t* sin_cache_ptr = cache_ptr + embed_dim; + const int head_idx = i; + const int64_t token_head = + token_idx * query_stride + head_idx * head_size; + scalar_t* head_query = token_head + query; + for (int j = 0; j < embed_dim; j += 1) { + const int rot_offset = j; + const int x_index = 2 * rot_offset; + const int y_index = 2 * rot_offset + 1; + + const float cos = cos_cache_ptr[rot_offset]; + const float sin = sin_cache_ptr[rot_offset]; + + const float x = head_query[x_index]; + const float y = head_query[y_index]; + + head_query[x_index] = x * cos - y * sin; + head_query[y_index] = y * cos + x * sin; + } + } + } + + if (key == nullptr) { + return; + } + +#pragma omp parallel for collapse(2) + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + for (int i = 0; i < num_kv_heads; ++i) { + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + const scalar_t* cos_cache_ptr = cache_ptr; + const scalar_t* sin_cache_ptr = cache_ptr + embed_dim; + const int head_idx = i; + const int64_t token_head = token_idx * key_stride + head_idx * head_size; + scalar_t* head_key = key + token_head; + for (int j = 0; j < embed_dim; j += 1) { + const int rot_offset = j; + const int x_index = 2 * rot_offset; + const int y_index = 2 * rot_offset + 1; + + const float cos = cos_cache_ptr[rot_offset]; + const float sin = sin_cache_ptr[rot_offset]; + + const float x = head_key[x_index]; + const float y = head_key[y_index]; + + head_key[x_index] = x * cos - y * sin; + head_key[y_index] = y * cos + x * sin; + } + } + } +} +}; // namespace + +void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + std::optional key, int64_t head_size, + torch::Tensor& cos_sin_cache, bool is_neox) { + int num_tokens = positions.numel(); + int rot_dim = cos_sin_cache.size(1); + int num_heads = query.size(-1) / head_size; + int num_kv_heads = key.has_value() ? key->size(-1) / head_size : num_heads; + int64_t key_stride = key.has_value() ? key->stride(-2) : 0; + int64_t query_stride = query.stride(-2); + + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), "rotary_embedding_impl", [&] { + CPU_KERNEL_GUARD_IN(rotary_embedding_impl) + if (is_neox) { + rotary_embedding_impl( + positions.data_ptr(), query.data_ptr(), + key.has_value() ? key->data_ptr() : nullptr, + cos_sin_cache.data_ptr(), rot_dim, query_stride, + key_stride, num_heads, num_kv_heads, head_size, num_tokens); + } else { + rotary_embedding_gptj_impl( + positions.data_ptr(), query.data_ptr(), + key.has_value() ? 
key->data_ptr() : nullptr, + cos_sin_cache.data_ptr(), rot_dim, query_stride, + key_stride, num_heads, num_kv_heads, head_size, num_tokens); + } + + CPU_KERNEL_GUARD_OUT(rotary_embedding_impl) + }); +} diff --git a/vllm_v0.10.0/csrc/cpu/quant.cpp b/vllm_v0.10.0/csrc/cpu/quant.cpp new file mode 100644 index 0000000..c1f7c64 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/quant.cpp @@ -0,0 +1,949 @@ +#include "cpu_types.hpp" +#include "dnnl_helper.hpp" + +namespace { +template +struct KernelVecType { + using load_vec_type = void; + using azp_adj_load_vec_type = void; + using cvt_vec_type = void; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP32Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::BF16Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +#if defined(__AVX512F__) || defined(__aarch64__) +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int num_tokens, + const int hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j); + } + + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j, hidden_size - j); + } +} + +template +void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int num_tokens, + const int hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + #pragma omp parallel for + for (int i = 0; i < 
num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = static_cast(azp_val); + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); + + { + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j); + } + + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j, hidden_size - j); + } + } +} + +template +void static_quant_epilogue(const float* input, scalar_t* output, + const float a_scale, const float* b_scale, + const int32_t* azp_with_adj, const int num_tokens, + const int hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) + using load_vec_t = typename KernelVecType::load_vec_type; + using azp_adj_load_vec_t = + typename KernelVecType::azp_adj_load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + cvt_vec_t a_scale_vec(a_scale); + cvt_vec_t b_scale_vec(*b_scale); + cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; + + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + cvt_vec_t elems_fp32(input + i * hidden_size + j); + azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + + if constexpr (PerChannel) { + b_scale_vec = cvt_vec_t(b_scale + j); + scale_vec = b_scale_vec * a_scale_vec; + } + + elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j); + } + + cvt_vec_t elems_fp32(input + i * hidden_size + j); + azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); + cvt_vec_t 
azp_adj_fp32(azp_adj_vec); + + if constexpr (PerChannel) { + b_scale_vec = cvt_vec_t(b_scale + j); + scale_vec = b_scale_vec * a_scale_vec; + } + + elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j, hidden_size - j); + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const float* b_scale, + const int32_t* azp, const int32_t* azp_adj, + const scalar_t* bias, const int num_tokens, + const int hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using azp_adj_load_vec_t = + typename KernelVecType::azp_adj_load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + int j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + if constexpr (!PerChannel) { + zp_scale_val *= *b_scale; + } + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + cvt_vec_t elems_fp32(input + i * hidden_size + j); + elems_fp32 = elems_fp32 * token_scale_vec; + + if constexpr (AZP) { + azp_adj_load_vec_t azp_adj_vec(azp_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; + + if constexpr (PerChannel) { + cvt_vec_t b_scale_vec(b_scale + j); + azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; + } + + elems_fp32 = elems_fp32 - azp_adj_fp32; + } + + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j); + } + + cvt_vec_t elems_fp32(input + i * hidden_size + j); + elems_fp32 = elems_fp32 * token_scale_vec; + + if constexpr (AZP) { + azp_adj_load_vec_t azp_adj_vec(azp_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; + + if constexpr (PerChannel) { + cvt_vec_t b_scale_vec(b_scale + j); + azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; + } + + elems_fp32 = elems_fp32 - azp_adj_fp32; + } + + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j, hidden_size - j); + } +} +#elif defined(__powerpc64__) +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int num_tokens, + const int hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t 
elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j); + } + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j, hidden_size - j); + } +} +template +void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int num_tokens, + const int hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = static_cast(azp_val); + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); + + { + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j); + } + + load_vec_t elems(input + i * hidden_size + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j, hidden_size - j); + } + } +} +template +void static_quant_epilogue(const float* 
input, scalar_t* output, + const float a_scale, const float* b_scale, + const int32_t* azp_with_adj, const int num_tokens, + const int hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) + using load_vec_t = typename KernelVecType::load_vec_type; + using azp_adj_load_vec_t = + typename KernelVecType::azp_adj_load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + cvt_vec_t a_scale_vec(a_scale); + cvt_vec_t b_scale_vec(*b_scale); + cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; + + int j = 0; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + cvt_vec_t elems_fp32(input + i * hidden_size + j); + azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + + if constexpr (PerChannel) { + b_scale_vec = cvt_vec_t(b_scale + j); + scale_vec = b_scale_vec * a_scale_vec; + } + elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j); + } + + cvt_vec_t elems_fp32(input + i * hidden_size + j); + azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + + if constexpr (PerChannel) { + b_scale_vec = cvt_vec_t(b_scale + j); + scale_vec = b_scale_vec * a_scale_vec; + } + + elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j, hidden_size - j); + } +} +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const float* b_scale, + const int32_t* azp, const int32_t* azp_adj, + const scalar_t* bias, const int num_tokens, + const int hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using azp_adj_load_vec_t = + typename KernelVecType::azp_adj_load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + int j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + if constexpr (!PerChannel) { + zp_scale_val *= *b_scale; + } + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + cvt_vec_t elems_fp32(input + i * hidden_size + j); + elems_fp32 = elems_fp32 * token_scale_vec; + + if constexpr (AZP) { + azp_adj_load_vec_t azp_adj_vec(azp_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; + + if constexpr (PerChannel) { + cvt_vec_t b_scale_vec(b_scale + j); + azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; + } + + elems_fp32 = elems_fp32 - azp_adj_fp32; + } + + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j); + } + + cvt_vec_t elems_fp32(input + i * hidden_size + j); + elems_fp32 = elems_fp32 * token_scale_vec; + + if constexpr (AZP) { + azp_adj_load_vec_t azp_adj_vec(azp_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; + + if constexpr (PerChannel) { + cvt_vec_t b_scale_vec(b_scale + j); + azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; + } + + 
elems_fp32 = elems_fp32 - azp_adj_fp32; + } + + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j, hidden_size - j); + } +} +#else +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int num_tokens, + const int hidden_size) { + TORCH_CHECK(false, + "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 " + "support.") +} + +template +void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int num_tokens, + const int hidden_size) { + TORCH_CHECK(false, + "dynamic_scaled_int8_quant_impl requires " + "AVX512/powerpc64/AArch64 support.") +} + +template +void static_quant_epilogue(const float* input, scalar_t* output, + const float a_scale, const float* b_scale, + const int32_t* azp_with_adj, const int num_tokens, + const int hidden_size) { + TORCH_CHECK( + false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.") +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const float* b_scale, + const int32_t* azp, const int32_t* azp_with_adj, + const scalar_t* bias, const int num_tokens, + const int hidden_size) { + TORCH_CHECK( + false, + "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.") +} +#endif +} // namespace + +void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& a_scales, // [1] or [M] + const torch::Tensor& b_scales, // [1] or [OC] + const std::optional& bias // [OC] +) { + CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, + "int8_scaled_mm only supports INT8 inputs.") + TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); + TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && + b.size(1) == c.size(1)); + TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); + TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); + + // Check for strides and alignment + TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(c.stride(0) % 16 == 0 && + b.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + + if (bias) { + TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && + bias->dim() == 1); + } + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { + if (a_scales.numel() != 1) { + // per-token + // Note: oneDNN doesn't support per-token activation quantization + // Ideally we want to fuse the GEMM and the scale procedure with oneDNN + // JIT, the intermediate data is cached in registers or L1. But for now + // the oneDNN GEMM code generation only supports two quantization + // patterns: per-tensor or per-output-channel of weight. + // So we have to apply the per-token scale with a 'epilogue'. In C=s_a * + // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN + // GEMM, then the per-token scale (and bias) is applied with the epilogue + // C=s_a * C_inter + bias. 
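// Editor's illustration (not part of the original commit): in scalar form,
// the per-token path below computes, for row m and column n,
//   C[m][n] = a_scale[m] * C_inter[m][n] + (bias ? bias[n] : 0.0f)
// where C_inter = (A_q @ B_q) * s_b has already been produced by the oneDNN
// GEMM. A hypothetical float-only reference of that epilogue, shown
// standalone and useful only for sanity-checking the vectorized
// dynamic_quant_epilogue<..., /*AZP=*/false, ...> specialization:
static void per_token_epilogue_ref(const float* c_inter, float* c_out,
                                   const float* a_scale, const float* bias,
                                   int M, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float v = a_scale[m] * c_inter[m * N + n];
      c_out[m * N + n] = (bias != nullptr) ? v + bias[n] : v;
    }
  }
}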
+ torch::Tensor tmp_fp32_out = + torch::empty_like(c, ::at::ScalarType::Float); + // Compute C_inter=s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); + if (bias.has_value()) { + // Compute C=s_a * C_inter + bias + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), nullptr, nullptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } else { + // Compute C=s_a * C_inter + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, + c.size(0), c.size(1)); + } + } else { + // per-tensor + if (bias.has_value()) { + // Compute C=s_a * s_b * (A@B) + bias + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), c.data_ptr(), + bias->data_ptr(), a.size(0), b.size(1), a.size(1), + a_scales.data_ptr(), b_scales.data_ptr(), + a_scales.numel(), b_scales.numel()); + } else { + // Compute C=s_a * s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), c.data_ptr(), + nullptr, a.size(0), b.size(1), a.size(1), + a_scales.data_ptr(), b_scales.data_ptr(), + a_scales.numel(), b_scales.numel()); + } + } + }); +} + +void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& a_scales, // [1] or [M] + const torch::Tensor& b_scales, // [1] or [OC] + const torch::Tensor& azp_adj, // [OC] + const std::optional& azp, // [1] or [M] + const std::optional& bias // [OC] +) { + CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, + "int8_scaled_mm_azp only supports INT8 inputs.") + TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); + TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && + b.size(1) == c.size(1)); + TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); + TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); + + // Check for strides and alignment + TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(c.stride(0) % 16 == 0 && + b.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + + if (bias) { + TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); + } + if (azp) { + TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); + } + TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); + + // azp & bias types + TORCH_CHECK(azp_adj.dtype() == torch::kInt32); + TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); + TORCH_CHECK(!bias || bias->dtype() == c.dtype(), + "currently bias dtype must match output dtype ", c.dtype()); + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { + torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); + if (a_scales.numel() != 1) { + // per-token + // Note: oneDNN doesn't support per-token activation quantization + // Compute C_inter=s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); + if (bias.has_value()) { + // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias 
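// Editor's note (an assumption stated for clarity, not taken from the
// commit): azp_adj is expected to hold the per-output-channel reduction of
// the quantized weight, azp_adj[n] = sum_k B_q[k][n]. With asymmetric
// activations A_q = A / s_a + azp, we get
//   A @ B ~= s_a * s_b * (A_q @ B_q - azp * azp_adj),
// so the branches below subtract s_a * s_b * azp * azp_adj from
// C_inter = s_b * (A_q @ B_q), matching the formula in the comment above.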
+ if (b_scales.numel() != 1) { + // Per-Channel + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + // Per-Tensor + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + if (b_scales.numel() != 1) { + // Per-Channel + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), nullptr, + c.size(0), c.size(1)); + } else { + // Per-Tensor + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), nullptr, + c.size(0), c.size(1)); + } + } + } else { + // per-tensor + if (bias.has_value()) { + // Compute C_inter=s_a * s_b * (A@B) + bias + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), bias->data_ptr(), + a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), + b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); + } else { + // Compute C_inter=s_a * s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), + a_scales.numel(), b_scales.numel()); + } + + // Compute C=C_inter - s_a * s_b * azp_adj + if (b_scales.numel() != 1) { + // Per-Channel + static_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + *a_scales.data_ptr(), b_scales.data_ptr(), + azp_adj.data_ptr(), a.size(0), b.size(1)); + } else { + // Per-Tensor + static_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + *a_scales.data_ptr(), b_scales.data_ptr(), + azp_adj.data_ptr(), a.size(0), b.size(1)); + } + } + }); +} + +// static-per-tensor quantization. +void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] + const torch::Tensor& input, // [..., hidden_size] + const torch::Tensor& scale, + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); + + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + hidden_size); + } else { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, hidden_size); + } + }); +} + +// dynamic-per-token quantization. 
+void dynamic_scaled_int8_quant( + torch::Tensor& out, // [..., hidden_size] + const torch::Tensor& input, // [..., hidden_size] + torch::Tensor& scale, // [..., 1] + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); + + int const hidden_size = input.size(-1); + int const num_tokens = input.numel() / hidden_size; + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, hidden_size); + } + }); +} + +#if defined(__powerpc64__) +void int8_scaled_mm_ppc64le(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& a_scales, + const torch::Tensor& b_scales, + const std::optional& bias // [OC] +) { + CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, + "int8_scaled_mm_ppc64le only supports INT8 inputs."); + TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); + TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && + b.size(1) == c.size(1)); + // We dont need this + TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); + TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); + + // Check for strides and alignment + TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(c.stride(0) % 16 == 0 && + b.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + + if (bias) { + TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && + bias->dim() == 1); + } + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] { + torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); + // Compute C_inter=s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); + if (bias.has_value()) { + // Compute C=s_a * C_inter + bias + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), nullptr, nullptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } else { + // Compute C=s_a * C_inter + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, + c.size(0), c.size(1)); + } + }); +} + +#endif diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/common.h b/vllm_v0.10.0/csrc/cpu/sgl-kernels/common.h new file mode 100644 index 0000000..b96037e --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/common.h @@ -0,0 +1,238 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#pragma once + +#include +#include +#include + +// clang-format off + +#if defined(_OPENMP) +#include +#endif + +namespace { + +// dispatch bool +#define AT_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) 
\ + [&] { \ + if (BOOL_V) { \ + constexpr bool BOOL_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool BOOL_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +// dispatch: bfloat16, float16, int8_t, fp8_e4m3 +#define CPU_DISPATCH_PACKED_TYPES(TYPE, ...) \ + [&] { \ + switch (TYPE) { \ + case at::ScalarType::BFloat16 : { \ + using packed_t = at::BFloat16; \ + return __VA_ARGS__(); \ + } \ + case at::ScalarType::Half: { \ + using packed_t = at::Half; \ + return __VA_ARGS__(); \ + } \ + case at::ScalarType::Char : { \ + using packed_t = int8_t; \ + return __VA_ARGS__(); \ + } \ + case at::ScalarType::Float8_e4m3fn : { \ + using packed_t = at::Float8_e4m3fn; \ + return __VA_ARGS__(); \ + } \ + default: \ + TORCH_CHECK(false, "Unsupported floating data type.\n"); \ + } \ + }() + +#define UNUSED(x) (void)(x) + +#define CHECK_CPU(x) TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") + +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_LAST_DIM_CONTIGUOUS(x) \ + TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension") + +#define CHECK_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_LAST_DIM_CONTIGUOUS(x) + +#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor") + +#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b) + +// parallel routines +constexpr int GRAIN_SIZE = 1024; + +template ::value, int>::type = 0> +inline T div_up(T x, T y) { return (x + y - 1) / y; } + +template +inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) { +#if 0 + // onednn partition pattern + T& n_my = n_end; + if (nth <= 1 || n == 0) { + n_start = 0; + n_my = n; + } else { + T n1 = div_up(n, nth); + T n2 = n1 - 1; + T T1 = n - n2 * nth; + n_my = ith < T1 ? n1 : n2; + n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2; + } + n_end += n_start; +#else + // pytorch aten partition pattern + T n_my = div_up(n, nth); + n_start = ith * n_my; + n_end = std::min(n_start + n_my, n); +#endif +} + +template +inline void parallel_for(int n, const func_t& f) { +#if defined(_OPENMP) +#pragma omp parallel +{ + int nth = omp_get_num_threads(); + int ith = omp_get_thread_num(); + int tbegin, tend; + balance211(n, nth, ith, tbegin, tend); + f(tbegin, tend); +} +#else + f(0, n); +#endif +} + +// for 1d parallel, use `actual_nth` +// for 2d parallel, use even nths, e.g. 
43->42 +int inline adjust_num_threads(int m) { + int actual_nth = at::get_num_threads(); + if (m == 1) { + return actual_nth; + } + return std::max(1, (actual_nth >> 1) * 2); +} + +template +inline void parallel_2d(int m, int n, const func_t& f) { + + // make sure we have even num_threads + int nth = adjust_num_threads(m); + + // [NOTE] thread blocking: + // + // 1) prefer square block per thread + // 2) use even number of CPU cores + // 3) use all `num_threads` cores + // + // we have: + // TM * TN = T + // BM / TM = BN / TN + // then: + // TM = ((BM / BN) * T) ^ 0.5 + // + float r = float(m) / n; + int nth_m = std::ceil(std::sqrt(r * nth)); + int nth_n = 1; + for (; nth_m > 0; --nth_m) { + nth_n = nth / nth_m; + if (nth_m * nth_n == nth) { + break; + } + } + +#if defined(_OPENMP) +#pragma omp parallel num_threads(nth) +{ + int ith = omp_get_thread_num(); + int ith_m = ith / nth_n; + int ith_n = ith % nth_n; + + int thread_block_m = div_up(m, nth_m); + int thread_block_n = div_up(n, nth_n); + + int begin_m = ith_m * thread_block_m; + int end_m = std::min(m, begin_m + thread_block_m); + int begin_n = ith_n * thread_block_n; + int end_n = std::min(n, begin_n + thread_block_n); + + f(begin_m, end_m, begin_n, end_n); +} +#else + f(0, m, 0, n); +#endif +} + +template +int get_cache_blocks(int BLOCK_SIZE, int K) { + // L2 2MB and ratio of 50% + const int L2_size = 2048 * 1024 >> 1; + return std::max(1, int(L2_size / (BLOCK_SIZE * K * sizeof(T)))); +} + +// data indexing for dimension collapse +template +inline T data_index_init(T offset) { + return offset; +} + +template +inline T data_index_init(T offset, T& x, const T& X, Args&&... args) { + offset = data_index_init(offset, std::forward(args)...); + x = offset % X; + return offset / X; +} + +inline bool data_index_step() { + return true; +} + +template +inline bool data_index_step(T& x, const T& X, Args&&... args) { + if (data_index_step(std::forward(args)...)) { + x = ((x + 1) == X) ? 0 : (x + 1); + return x == 0; + } + return false; +} + +// forced unroll for perf critical path + +#if __has_attribute(always_inline) +#define ALWAYS_INLINE __attribute__((__always_inline__)) inline +#else +#define ALWAYS_INLINE inline +#endif + +template +struct Unroll { + template + ALWAYS_INLINE void operator()(const Func& f, Args... args) const { + Unroll{}(f, args...); + f(std::integral_constant{}, args...); + } +}; + +template <> +struct Unroll<1> { + template + ALWAYS_INLINE void operator()(const Func& f, Args... 
args) const { + f(std::integral_constant{}, args...); + } +}; + +} // anonymous namespace diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm.cpp b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm.cpp new file mode 100644 index 0000000..c122d07 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm.cpp @@ -0,0 +1,464 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +// packed layout: +// quants {N, K} int8_t +// comp {N} int32_t +template +inline void s8s8_compensation(int8_t* __restrict__ packed, int K) { +#if defined(CPU_CAPABILITY_AVX512) + constexpr int COLS = BLOCK_N / 16; + __m512i vcomp[COLS]; + + for (int col = 0; col < COLS; ++col) { + vcomp[col] = _mm512_setzero_si512(); + } + + const int64_t offset = BLOCK_N * K; + const __m512i off = _mm512_set1_epi8(static_cast(0x80)); + for (int k = 0; k < K / 4; ++k) { + for (int col = 0; col < COLS; ++col) { + __m512i vb = _mm512_loadu_si512((const __m512i *)(packed + k * BLOCK_N * 4 + col * 64)); + vcomp[col] = _mm512_dpbusd_epi32(vcomp[col], off, vb); + } + } + + for (int col = 0; col < COLS; ++col) { + _mm512_storeu_si512((__m512i *)(packed + offset + col * 64), vcomp[col]); + } +#else + TORCH_CHECK(false, "s8s8_compensation not implemented!"); +#endif +} + +// convert to vnni format +// from [N, K] to [K/2, N, 2] for bfloat16 and float16 +template +inline void pack_vnni(packed_t* __restrict__ packed, const packed_t* __restrict__ weight, int N, int K) { + const int VNNI_BLK = 2; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < K / VNNI_BLK; ++k) { + for (int d = 0; d < VNNI_BLK; ++d) { + packed[k * N * VNNI_BLK + n * VNNI_BLK + d] = weight[n * K + k * VNNI_BLK + d]; + } + } + } +} + +template <> +inline void pack_vnni(int8_t* __restrict__ packed, const int8_t* __restrict__ weight, int N, int K) { + constexpr int BLOCK_N = block_size_n(); + TORCH_CHECK(N == BLOCK_N); + + const int VNNI_BLK = 4; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < K / VNNI_BLK; ++k) { + for (int d = 0; d < VNNI_BLK; ++d) { + packed[k * N * VNNI_BLK + n * VNNI_BLK + d] = weight[n * K + k * VNNI_BLK + d]; + } + } + } + s8s8_compensation(packed, K); +} + +template +inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d); + fVec data1 = fVec::loadu(input + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d]); + } +} + +template +inline void copy_add_stub(scalar_t* __restrict__ out, const float* __restrict__ input, const float* __restrict__ bias, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d) + fVec::loadu(bias + d); + fVec data1 = fVec::loadu(input + d + fVec::size()) + fVec::loadu(bias + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + bias[d]); + } +} + +template +struct tinygemm_kernel_nn { + static inline 
void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, scalar_t* __restrict__ C, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::BFloat16* __restrict__ B, at::BFloat16* __restrict__ C, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512bh va; + __m512bh vb[COLS]; + __m512 vc[ROWS * COLS]; + + auto loadc = [&](auto i) { + constexpr int col = i % COLS; + if constexpr (has_bias) { + vc[i] = _mm512_loadu_ps(bias + col * 16); + } else { + vc[i] = _mm512_set1_ps(0.f); + } + }; + Unroll{}(loadc); + + const int64_t K2 = K >> 1; + const int64_t lda2 = lda >> 1; + const int64_t ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const float* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + } + if constexpr (row == 0) { + vb[col] = (__m512bh)(_mm512_loadu_si512(b_ptr + k * ldb2 + col * 16)); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + } + vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K2; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2, 4 use 512bit store + // for COLS = 1, 3 use 256bit store + if constexpr (COLS % 2 == 0) { + if constexpr (col % 2 == 0) { + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(vc[row * COLS + col + 1], vc[row * COLS + col]))); + } + } else { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(C + row * ldc + col * 16), + (__m256i)(_mm512_cvtneps_pbh(vc[i]))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, \ + has_bias ? 
bias + nb_start : nullptr, K, lda, ldb, ldc); + +template +struct brgemm { + static inline void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, scalar_t* __restrict__ C, + float* __restrict__ Ctmp, const float* __restrict__ bias, + int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int BLOCK_N = block_size_n(); + at::native::cpublas::brgemm( + M, N, K, lda, ldb, BLOCK_N, /* add_C */false, + A, B, Ctmp); + + // copy from Ctmp to C + for (int64_t m = 0; m < M; ++m) { + if constexpr (has_bias) { + copy_add_stub(C + m * ldc, Ctmp + m * BLOCK_N, bias, N); + } else { + copy_stub(C + m * ldc, Ctmp + m * BLOCK_N, N); + } + } + } +}; + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B, + scalar_t* __restrict__ C, + float* __restrict__ Ctmp, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg) { + + if (brg) { + brgemm::apply( + A, B, C, Ctmp, bias, + M, N, K, lda, ldb, ldc); + return; + } + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + case 0x14: LAUNCH_TINYGEMM_KERNEL_NN(1, 64); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + case 0x24: LAUNCH_TINYGEMM_KERNEL_NN(2, 64); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + case 0x34: LAUNCH_TINYGEMM_KERNEL_NN(3, 64); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void weight_packed_linear_kernel_impl( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ mat1, + const scalar_t* __restrict__ mat2, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t mat1_strideM, + int64_t out_strideM) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // use avx512-bf16 when a) M is small; b) dtype is bfloat16, otherwise use amx + const bool use_brgemm = (M > 4) || (!std::is_same_v); + + // l2 cache block for n + int64_t cache_blocks_nb = get_cache_blocks(BLOCK_N, K); + + // parallel on [MB, NB] + AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] { + parallel_2d(MB, NB, [&](int64_t begin_mb, int64_t end_mb, int64_t begin_nb, int64_t end_nb) { + + // for brgemm, use float32 for accumulate + alignas(64) float Ctmp[BLOCK_M * BLOCK_N]; + + for (int64_t nbb = begin_nb; nbb < end_nb; nbb += cache_blocks_nb) { + for (int64_t mb = begin_mb; mb < end_mb; ++mb) { + for (int64_t nb = nbb; nb < std::min(nbb + cache_blocks_nb, end_nb); ++nb) { + + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(M - mb_start, BLOCK_M); + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(N - nb_start, BLOCK_N); + + tinygemm_kernel( + /* A */ mat1 + mb_start * mat1_strideM, + /* B */ mat2 + 
nb_start * K /* nb * BLOCK_N * K */, + /* C */ out + mb_start * out_strideM + nb_start, + /* Ctmp*/ Ctmp, + /* bias*/ bias + nb_start, + /* M */ mb_size, + /* N */ nb_size, + /* K */ K, + /* lda */ mat1_strideM, + /* ldb */ nb_size, + /* ldc */ out_strideM, + /* brg */ use_brgemm); + }}} + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + }); +} + +} // anonymous namespace + +// tinygemm interface +template +void tinygemm_kernel(const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, scalar_t* __restrict__ C, + float* __restrict__ Ctmp, int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc, bool brg) { + tinygemm_kernel(A, B, C, Ctmp, nullptr, M, N, K, lda, ldb, ldc, brg); +} + +#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \ + template void tinygemm_kernel( \ + const TYPE* __restrict__ A, const TYPE* __restrict__ B, TYPE* __restrict__ C, \ + float* __restrict__ Ctmp, int64_t M, int64_t N, int64_t K, int64_t lda, \ + int64_t ldb, int64_t ldc, bool brg) + +INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16); +INSTANTIATE_TINYGEMM_TEMPLATE(at::Half); + +at::Tensor convert_weight_packed(at::Tensor& weight) { + // for 3d moe weights + // weight : [E, OC, IC] + // w1 : [E, 2N, K] + // w2 : [E, K, N] + CHECK_INPUT(weight); + + const int64_t ndim = weight.ndimension(); + TORCH_CHECK(ndim == 2 || ndim == 3, "expect weight to be 2d or 3d, got ", ndim, "d tensor."); + const auto st = weight.scalar_type(); + const int64_t E = ndim == 3 ? weight.size(0) : 1; + const int64_t OC = ndim == 3 ? weight.size(1) : weight.size(0); + const int64_t IC = ndim == 3 ? weight.size(2) : weight.size(1); + + // we handle 2 TILE_N at a time. + TORCH_CHECK(OC % TILE_N == 0, "invalid weight out features ", OC); + TORCH_CHECK(IC % TILE_K == 0, "invalid weight input features ", IC); + + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t NB = div_up(OC, BLOCK_N); + + // use phony sizes here [E, OC, IC], for each [E], [OC, IC] -> [IC / 2, OC, 2] + auto packed_weight = at::empty({}, weight.options()); + const int64_t stride = OC * IC; + + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf || st == at::kChar || st == at::kFloat8_e4m3fn, + "expect weight to be bfloat16, float16, int8 or fp8_e4m3."); + + CPU_DISPATCH_PACKED_TYPES(st, [&] { + // adjust most inner dimension size + const int packed_row_size = get_row_size(IC); + auto sizes = weight.sizes().vec(); + sizes[ndim - 1] = packed_row_size; + packed_weight.resize_(sizes); + + const packed_t* w_data = weight.data_ptr(); + packed_t* packed_data = packed_weight.data_ptr(); + + // parallel on {E, NB} + at::parallel_for(0, E * NB, 0, [&](int64_t begin, int64_t end) { + int64_t e{0}, nb{0}; + data_index_init(begin, e, E, nb, NB); + + for (int64_t i = begin; i < end; ++i) { + UNUSED(i); + + int64_t n = nb * BLOCK_N; + int64_t n_size = std::min(BLOCK_N, OC - n); + pack_vnni( + packed_data + e * OC * packed_row_size + n * packed_row_size, + w_data + e * stride + n * IC, + n_size, + IC); + + // move to the next index + data_index_step(e, E, nb, NB); + } + }); + }); + return packed_weight; +} + +// mat1 : [M, K] +// mat2 : [N, K] +// bias : [N] +// out : [M, N] +// +at::Tensor weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, + const std::optional& bias, bool is_vnni) { + RECORD_FUNCTION( + "sgl-kernel::weight_packed_linear", std::vector({mat1, mat2, bias})); + + auto packed_w = is_vnni ? 
mat2 : convert_weight_packed(mat2); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1); + CHECK_INPUT(mat2); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat2.size(1); + CHECK_EQ(mat1.size(1), K); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + auto out = at::empty({M, N}, mat1.options()); + + // strides + int64_t mat1_strideM = mat1.stride(0); + int64_t out_strideM = out.stride(0); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(mat1.scalar_type(), "weight_packed_linear_kernel_impl", [&] { + weight_packed_linear_kernel_impl( + out.data_ptr(), + mat1.data_ptr(), + packed_w.data_ptr(), + bias_data, + M, + N, + K, + mat1_strideM, + out_strideM); + }); + + return out; +} diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm.h b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm.h new file mode 100644 index 0000000..fba5673 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm.h @@ -0,0 +1,266 @@ +#pragma once + +#include + +// clang-format off + +// amx-bf16 +#define TILE_M 16 +#define TILE_N 16 +#define TILE_K 32 + +// block size for AMX gemm +constexpr int block_size_m() { return 2 * TILE_M; } +constexpr int block_size_n() { return 2 * TILE_N; } + +// define threshold using brgemm (intel AMX) +template inline bool can_use_brgemm(int M); +template <> inline bool can_use_brgemm(int M) { return M > 4; } +template <> inline bool can_use_brgemm(int M) { return true; } +// TODO: add u8s8 brgemm, this requires PyTorch 2.7 +template <> inline bool can_use_brgemm(int M) { return false; } +template <> inline bool can_use_brgemm(int M) { return M > 4; } +template <> inline bool can_use_brgemm(int M) { return M > 4; } + +// work around compiler internal error +#define BLOCK_K 128 // 4 * TILE_K + +// adjust leading dimension size for K +template +inline int64_t get_row_size(int64_t K) { + return K; +} + +template <> +inline int64_t get_row_size(int64_t K) { + return K + sizeof(int32_t); +} + +inline int64_t get_row_size(int64_t K, bool use_int8_w8a8) { + return use_int8_w8a8 ? 
K + sizeof(int32_t) : K; +} + +// pack weight to vnni format +at::Tensor convert_weight_packed(at::Tensor& weight); + +// moe implementations for int8 w8a8 +template +void fused_experts_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + uint8_t* __restrict__ A_tmp, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad); + +// moe implementations for fp8 w8a16 +template +void fused_experts_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t block_size_K, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad); + +// moe implementations for int4 w4a16 +template +void fused_experts_int4_w4a16_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::quint4x2* __restrict__ packed_w1, + const at::quint4x2* __restrict__ packed_w2, + const uint8_t* __restrict__ w1z, + const uint8_t* __restrict__ w2z, + const scalar_t* __restrict__ w1s, + const scalar_t* __restrict__ w2s, + int group_size, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad); + +// shared expert implementation for int8 w8a8 +template +void shared_expert_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K); + +template +void shared_expert_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t 
block_size_K, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K); + +// tinygemm interface +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B, + scalar_t* __restrict__ C, + float* __restrict__ Ctmp, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B, + scalar_t* __restrict__ C, + int32_t* __restrict__ Ctmp, + const float* __restrict__ As, + const float* __restrict__ Bs, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ scale, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg, + int64_t block_size_K); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::quint4x2* __restrict__ B, + scalar_t* __restrict__ C, + const uint8_t* __restrict__ Bz, + const scalar_t* __restrict__ Bs, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + int64_t M, + int64_t N, + int64_t K, + int group_size, + int64_t lda, + int64_t ldb, + int64_t ldc, + int64_t strideBz, + int64_t strideBs, + bool brg); + +// TODO: debug print, remove me later +inline void print_16x32i(const __m512i x) { + int32_t a[16]; + _mm512_storeu_si512((__m512i *)a, x); + + for (int i = 0; i < 16; i++){ + std::cout << a[i] << " "; + } + std::cout << std::endl; +} + +inline void print_16x32(const __m512 x) { + float a[16]; + _mm512_storeu_ps((__m512 *)a, x); + + for (int i = 0; i < 16; i++){ + std::cout << a[i] << " "; + } + std::cout << std::endl; +} + + +inline void print_32x8u(const __m256i x) { + uint8_t a[32]; + _mm256_storeu_si256((__m256i *)a, x); + + for (int i = 0; i < 32; ++i) { + std::cout << int32_t(a[i]) << " "; + } + std::cout << std::endl; +} diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm_fp8.cpp b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm_fp8.cpp new file mode 100644 index 0000000..b5f2f07 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm_fp8.cpp @@ -0,0 +1,530 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +// we use 4x32 for BLOCK_M +#define BLOCK_SIZE_M_SCALE 4 + +namespace { + +template +inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d); + fVec data1 = fVec::loadu(input + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d]); + } +} + +template +inline void copy_add_stub(scalar_t* __restrict__ out, const float* __restrict__ input, const float* __restrict__ bias, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec 
data0 = fVec::loadu(input + d) + fVec::loadu(bias + d); + fVec data1 = fVec::loadu(input + d + fVec::size()) + fVec::loadu(bias + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + bias[d]); + } +} + +inline void unpack_B( + at::BFloat16* __restrict__ Btmp, + const at::Float8_e4m3fn* __restrict__ packed_B, + int N, + int K, + int ldb, + int ldb_tmp, + float scale) { +#if defined(CPU_CAPABILITY_AVX512) + // [K/2, N, 2] + const int K2 = K >> 1; + const int ldb2 = ldb; // ldb * 2 >> 1; + const uint16_t* b_ptr = reinterpret_cast(packed_B); + const __m512 vd = _mm512_set1_ps(scale); + + constexpr int BLOCK_N = block_size_n(); + static_assert(BLOCK_N == 32); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 64; + +#pragma GCC unroll 4 + for (int k = 0; k < K2; ++k) { + __m512i b8 = _mm512_loadu_si512(b_ptr + k * ldb2); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2, _MM_HINT_T0); + } + + __m256i b8_0 = _mm512_extracti32x8_epi32(b8, 0); + __m256i b8_1 = _mm512_extracti32x8_epi32(b8, 1); + + __m512bh bf16_0 = CVT_FP8_TO_BF16(b8_0); + __m512bh bf16_1 = CVT_FP8_TO_BF16(b8_1); + + // Apply scale + __m512 f0_lo = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_0, 0)); + __m512 f0_hi = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_0, 1)); + __m512 f1_lo = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_1, 0)); + __m512 f1_hi = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_1, 1)); + + f0_lo = _mm512_mul_ps(f0_lo, vd); + f0_hi = _mm512_mul_ps(f0_hi, vd); + f1_lo = _mm512_mul_ps(f1_lo, vd); + f1_hi = _mm512_mul_ps(f1_hi, vd); + + bf16_0 = _mm512_cvtne2ps_pbh(f0_hi, f0_lo); + bf16_1 = _mm512_cvtne2ps_pbh(f1_hi, f1_lo); + + _mm512_storeu_si512(Btmp + k * ldb_tmp * 2 + 0, (__m512i)bf16_0); + _mm512_storeu_si512(Btmp + k * ldb_tmp * 2 + 32, (__m512i)bf16_1); + } +#else + TORCH_CHECK(false, "unpack_B: scalar path not implemented!"); +#endif +} + +template +struct tinygemm_kernel_nn { + static inline void apply( + const scalar_t* __restrict__ A, const packed_t* __restrict__ B, scalar_t* __restrict__ C, + const float* __restrict__ bias, const float* __restrict__ scale, int K, int lda, int ldb, int ldc, int64_t block_size_K) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::Float8_e4m3fn* __restrict__ B, at::BFloat16* __restrict__ C, + const float* __restrict__ bias, const float* __restrict__ scale, int K, int lda, int ldb, int ldc, int64_t block_size_K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + const int KB = div_up(K, BLOCK_K); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 64; + constexpr int PREFETCH_SIZE_KB = 1; + + __m512bh va; + __m512bh vb[COLS]; + __m512 vc[ROWS * COLS]; + __m512 vsum[ROWS * COLS]; + + // block quant scale + __m512 vscale; + + auto loadc = [&](auto i) { + constexpr int col = i % COLS; + if constexpr (has_bias) { + vc[i] = _mm512_loadu_ps(bias + col * 16); + } else { + vc[i] = _mm512_setzero_ps(); + } + }; + Unroll{}(loadc); + + const int lda2 = lda >> 1; + const int ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const uint16_t* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int k) { + constexpr int row = i / COLS; + 
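+ // Unroll<ROWS * COLS> walks a compile-time register tile: index i splits into
+ // (row, col), with each col covering 16 output columns. e.g. in the 4x32 case
+ // ROWS = 4 and COLS = 2, so vsum[] holds 8 accumulators per K-block.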
constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(a_ptr + row * lda2 + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + } + if constexpr (row == 0) { + if constexpr (col % 2 == 0) { + __m512i b8 = _mm512_loadu_si512(b_ptr + k * ldb2 + col * 16); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + vb[col + 0] = CVT_FP8_TO_BF16(_mm512_extracti32x8_epi32(b8, 0)); + vb[col + 1] = CVT_FP8_TO_BF16(_mm512_extracti32x8_epi32(b8, 1)); + } + } + vsum[i] = _mm512_dpbf16_ps(vsum[i], va, vb[col]); + }; + + constexpr int BLOCK_K2 = BLOCK_K >> 1; + for (int kb = 0; kb < KB; ++kb) { + int kb_start = kb * BLOCK_K2; + int kb_end = std::min(K, kb_start + BLOCK_K2); + // 1. load scale vector + vscale = _mm512_set1_ps(scale[kb]); + if constexpr (PREFETCH_SIZE_KB > 0) { + _mm_prefetch(scale + kb + PREFETCH_SIZE_KB, _MM_HINT_T0); + } + // 2. zero vsum for each block + Unroll{}([&](auto i) { + vsum[i] = _mm512_setzero_ps(); + }); + // 3. accumulate across each block + for (int k = kb_start; k < kb_end; ++k) { + Unroll{}(compute, k); + } + // 4. apply scale + Unroll{}([&](auto i) { + vc[i] = _mm512_fmadd_ps(vsum[i], vscale, vc[i]); + }); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2,4 use 512bit store + if constexpr (col % 2 == 0) { + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(vc[row * COLS + col + 1], vc[row * COLS + col]))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, \ + has_bias ? 
bias + nb_start : nullptr, scale, K, lda, ldb, ldc, block_size_K); + +template +struct brgemm { + static inline void apply( + const scalar_t* __restrict__ A, + const packed_t* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ bias, + const float* __restrict__ scale, + int M, + int N, + int K, + int lda, + int ldb, + int ldc) { + TORCH_CHECK(false, "struct brgemm: primary template not implemented!"); + } +}; + +template +struct brgemm { + static inline void apply( + const at::BFloat16* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + at::BFloat16* __restrict__ C, + at::BFloat16* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ bias, + const float* __restrict__ scale, + int M, + int N, + int K, + int lda, + int ldb, + int ldc) { + + constexpr int BLOCK_N = block_size_n(); + + // [K, BLOCK_N] -> [K / 2, BLOCK_N * 2] + const int ldb_tmp = BLOCK_N; + + for (int k = 0; k < K; k += BLOCK_K) { + int kb_size = std::min(BLOCK_K, K - k); + + int idx = k >> 7; // k / BLOCK_K where BLOCK_K = 128 + unpack_B(Btmp + k * ldb_tmp, B + k * ldb, N, kb_size, ldb, ldb_tmp, scale[idx]); + } + + at::native::cpublas::brgemm( + M, N, K, lda, ldb_tmp, BLOCK_N, /* add_C */ false, A, Btmp, Ctmp); + + // copy from Ctmp to C + for (int m = 0; m < M; ++m) { + if constexpr (has_bias) { + copy_add_stub(C + m * ldc, Ctmp + m * BLOCK_N, bias, N); + } else { + copy_stub(C + m * ldc, Ctmp + m * BLOCK_N, N); + } + } + } +}; + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ scale, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg, + int64_t block_size_K) { + + if (brg) { + brgemm::apply( + A, B, C, Btmp, Ctmp, bias, scale, M, N, K, lda, ldb, ldc); + return; + } + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void fp8_scaled_mm_kernel_impl( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ mat1, + const at::Float8_e4m3fn* __restrict__ mat2, + const float* __restrict__ scales2, + const float* __restrict__ bias, + scalar_t* __restrict__ buffer, + int64_t M, + int64_t N, + int64_t K, + int64_t mat1_strideM, + int64_t out_strideM, + int64_t block_size_N, + int64_t block_size_K, + int64_t buffer_size_per_thread) { + + constexpr int64_t BLOCK_M = block_size_m() * BLOCK_SIZE_M_SCALE; + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + const int64_t scale_size_K = div_up(K, block_size_K); + const int64_t blocks_n_per_group = block_size_N / BLOCK_N; + 
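+ // Per-thread scratch layout (allocated in fp8_scaled_mm_cpu):
+ //   Btmp : BLOCK_N * K scalar_t slots holding the dequantized B panel
+ //   Ctmp : BLOCK_M * BLOCK_N floats, aliased onto the remaining
+ //          BLOCK_M * BLOCK_N * 2 scalar_t slots (two bf16/half per float)
+ // Scale lookup: every BLOCK_N tile inside one block_size_N group shares the
+ // same row of K-block scales, i.e. scales2[(nb / blocks_n_per_group) * scale_size_K + kb].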
+ const bool use_brgemm = can_use_brgemm(M); + + // parallel on [MB, NB] + AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] { + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + int64_t mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + int tid = at::get_thread_num(); + scalar_t* __restrict__ Btmp = buffer + tid * buffer_size_per_thread; + float* __restrict__ Ctmp = (float*)((void*)(Btmp + BLOCK_N * K)); + + for (int64_t i = begin; i < end; ++i) { + UNUSED(i); + const float* scale_ptr = scales2 + (nb / blocks_n_per_group) * scale_size_K; + + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(M - mb_start, BLOCK_M); + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(N - nb_start, BLOCK_N); + + tinygemm_kernel( + /* A */ mat1 + mb_start * mat1_strideM, + /* B */ mat2 + nb_start * K, // nb * BLOCK_N * K + /* C */ out + mb_start * out_strideM + nb_start, + /* Btmp */ Btmp, + /* Ctmp */ Ctmp, + /* scale */ scale_ptr, + /* bias */ bias + nb_start, + /* M */ mb_size, + /* N */ nb_size, + /* K */ K, + /* lda */ mat1_strideM, + /* ldb */ nb_size, + /* ldc */ out_strideM, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + }); +} + +} // anonymous namespace + +// tinygemm interface +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ scale, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg, + int64_t block_size_K) { + tinygemm_kernel(A, B, C, Btmp, Ctmp, scale, nullptr, M, N, K, lda, ldb, ldc, brg, block_size_K); +} + +#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \ + template void tinygemm_kernel( \ + const TYPE* __restrict__ A, \ + const at::Float8_e4m3fn* __restrict__ B, \ + TYPE* __restrict__ C, \ + TYPE* __restrict__ Btmp, \ + float* __restrict__ Ctmp, \ + const float* __restrict__ scale, \ + int64_t M, \ + int64_t N, \ + int64_t K, \ + int64_t lda, \ + int64_t ldb, \ + int64_t ldc, \ + bool brg, \ + int64_t block_size_K) + +INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16); +INSTANTIATE_TINYGEMM_TEMPLATE(at::Half); + +at::Tensor fp8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2, + std::vector block_size, std::optional& bias, + at::ScalarType out_dtype, bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::fp8_scaled_mm_cpu", std::vector({mat1, mat2, scales2, block_size, bias})); + + auto packed_w = is_vnni ? 
mat2 : convert_weight_packed(mat2); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1); + CHECK_INPUT(mat2); + CHECK_INPUT(scales2); + TORCH_CHECK(scales2.scalar_type() == at::kFloat, + "fp8_scaled_mm_cpu: expect scales2 to be float32."); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat2.size(1); + + CHECK_EQ(mat1.size(1), K); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + TORCH_CHECK(block_size.size() == 2, + "fp8_scaled_mm_cpu: expect block_size.size() to be 2."); + + int64_t block_size_N = block_size[0]; + int64_t block_size_K = block_size[1]; + + constexpr int64_t BLOCK_M = block_size_m() * BLOCK_SIZE_M_SCALE; + constexpr int64_t BLOCK_N = block_size_n(); + TORCH_CHECK(block_size_N % BLOCK_N == 0, "fp8_scaled_mm_cpu: expect block_size_N to be multiples of BLOCK_N"); + TORCH_CHECK(block_size_K == BLOCK_K, "fp8_scaled_mm_cpu: expect block_size_K equals to BLOCK_K"); + CHECK_EQ(scales2.size(0), div_up(N, block_size_N)); + CHECK_EQ(scales2.size(1), div_up(K, block_size_K)); + + const auto st = mat1.scalar_type(); + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, + "fp8_scaled_mm_cpu: expect A to be bfloat16 or half."); + TORCH_CHECK(st == out_dtype, + "fp8_scaled_mm_cpu: expect A has same dtype with out_dtype."); + TORCH_CHECK(mat2.scalar_type() == at::kFloat8_e4m3fn, + "fp8_scaled_mm_cpu: expect mat2 to be fp8_e4m3."); + TORCH_CHECK(scales2.scalar_type() == at::kFloat, + "fp8_scaled_mm_cpu: expect scales to be float32."); + auto out = at::empty({M, N}, mat1.options().dtype(out_dtype)); + + // strides + int64_t mat1_strideM = mat1.stride(0); + int64_t out_strideM = out.stride(0); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + // Btmp : [T, BLOCK_N * K] + // Ctmp : [T, BLOCK_M * BLOCK_N] + int num_threads = at::get_num_threads(); + int64_t size_per_thread = BLOCK_N * K + BLOCK_M * BLOCK_N * 2; + auto buffer = at::empty({num_threads, size_per_thread}, mat1.options()); + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "fp8_scaled_mm_kernel_impl", [&] { + fp8_scaled_mm_kernel_impl( + out.data_ptr(), + mat1.data_ptr(), + packed_w.data_ptr(), + scales2.data_ptr(), + bias_data, + buffer.data_ptr(), + M, + N, + K, + mat1_strideM, + out_strideM, + block_size_N, + block_size_K, + size_per_thread); + }); + + return out; +} diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm_int8.cpp b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm_int8.cpp new file mode 100644 index 0000000..9a5ca06 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/gemm_int8.cpp @@ -0,0 +1,440 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +template +struct tinygemm_kernel_nn { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, scalar_t* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, at::BFloat16* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ 
Bcomp, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + static_assert(COLS % 2 == 0); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512i va; + __m512i vb[COLS]; + __m512i vc[ROWS * COLS]; + __m512i vcomp[COLS]; + __m512 vd0; + __m512 vd1[COLS]; + + // oops! 4x4 spills but luckily we use 4x2 + __m512 vbias[COLS]; + + // [NOTE]: s8s8 igemm compensation in avx512-vnni + // + // avx512-vnni has no s8s8, so we need to change s8s8 to u8s8 with compensate: + // + // a * b = (a + 128) * b - 128 * b + // s s u s u s + // + // 1) 128 * b is pre-computed when packing B to vnni formats + // 2) a + 128 is fused when dynamically quantize A + // + auto loadc = [&](auto i) { + vc[i] = _mm512_set1_epi32(0); + }; + Unroll{}(loadc); + + const int64_t K4 = K >> 2; + const int64_t lda4 = lda >> 2; + const int64_t ldb4 = ldb; // ldb * 4 >> 2; + const int32_t* a_ptr = reinterpret_cast(A); + const int32_t* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = _mm512_set1_epi32(a_ptr[row * lda4 + k]); + } + if constexpr (row == 0) { + vb[col] = _mm512_loadu_si512(b_ptr + k * ldb4 + col * 16); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb4 + col * 16, _MM_HINT_T0); + } + } + vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K4; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + // load a scale + if constexpr(col == 0) { + vd0 = _mm512_set1_ps(As[row]); + } + // load b scale and vcomp per 2 vectors + // also load bias if any + if constexpr (row == 0) { + if constexpr (col % 2 == 0) { + vd1[col + 0] = _mm512_loadu_ps(Bs + col * 16); + vd1[col + 1] = _mm512_loadu_ps(Bs + col * 16 + 16); + vcomp[col + 0] = _mm512_loadu_si512(Bcomp + col * 16); + vcomp[col + 1] = _mm512_loadu_si512(Bcomp + col * 16 + 16); + if constexpr (has_bias) { + vbias[col + 0] = _mm512_loadu_ps(bias + col * 16); + vbias[col + 1] = _mm512_loadu_ps(bias + col * 16 + 16); + } + } + } + + // for COLS = 2, 4 use 512bit store + if constexpr (col % 2 == 0) { + __m512 vc0 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[row * COLS + col + 0], vcomp[col + 0])); + __m512 vc1 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[row * COLS + col + 1], vcomp[col + 1])); + if constexpr (has_bias) { + vc0 = _mm512_fmadd_ps(_mm512_mul_ps(vc0, vd0), vd1[col + 0], vbias[col + 0]); + vc1 = _mm512_fmadd_ps(_mm512_mul_ps(vc1, vd0), vd1[col + 1], vbias[col + 1]); + } else { + vc0 = _mm512_mul_ps(_mm512_mul_ps(vc0, vd0), vd1[col + 0]); + vc1 = _mm512_mul_ps(_mm512_mul_ps(vc1, vd0), vd1[col + 1]); + } + + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(vc1, vc0))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 4, C + mb_start * ldc + nb_start, \ + As + mb_start, Bs + nb_start, Bcomp + nb_start, \ + has_bias ? 
bias + nb_start : nullptr, K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B, + scalar_t* __restrict__ C, + int32_t* __restrict__ Ctmp, + const float* __restrict__ As, + const float* __restrict__ Bs, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg) { + + // B compensation + const int32_t* Bcomp = reinterpret_cast(B + block_size_n() * K); + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int64_t mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + case 0x14: LAUNCH_TINYGEMM_KERNEL_NN(1, 64); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + case 0x24: LAUNCH_TINYGEMM_KERNEL_NN(2, 64); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + case 0x34: LAUNCH_TINYGEMM_KERNEL_NN(3, 64); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void int8_scaled_mm_kernel_impl( + scalar_t* __restrict__ out, + const uint8_t* __restrict__ mat1, + const int8_t* __restrict__ mat2, + const float* __restrict__ scales1, + const float* __restrict__ scales2, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // TODO: brgemm u8s8 depends on PyTorch 2.7 release. 
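+ // brgemm stays disabled for u8s8, so every tile goes through the AVX512-VNNI
+ // tinygemm_kernel_nn path above. Packed weight layout: each BLOCK_N x K tile
+ // of int8 is followed by one int32 compensation value per output column
+ // (the pre-computed 128 * sum_k B[k, n] from the s8s8 -> u8s8 trick in the
+ // [NOTE] above), which the epilogue subtracts before applying As * Bs.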
+ const bool use_brgemm = false; + + // K + 4 after compensation + const int64_t packed_row_size = get_row_size(K); + + AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] { + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + int64_t mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + // for brgemm, use int32_t for accumulate + alignas(64) int32_t Ctmp[BLOCK_M * BLOCK_N]; + + for (int i = begin; i < end; ++i) { + UNUSED(i); + int mb_start = mb * BLOCK_M; + int mb_size = std::min(M - mb_start, BLOCK_M); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(N - nb_start, BLOCK_N); + + tinygemm_kernel( + /* A */ mat1 + mb_start * K, + /* B */ mat2 + nb_start * packed_row_size /* nb * BLOCK_N * (K + 4) */, + /* C */ out + mb_start * N + nb_start, + /* Ctmp*/ Ctmp, + /* As */ scales1 + mb_start, + /* Bs */ scales2 + nb_start, + /* bias*/ bias + nb_start, + /* M */ mb_size, + /* N */ nb_size, + /* K */ K, + /* lda */ K, + /* ldb */ nb_size, + /* ldc */ N, + /* brg */ use_brgemm); + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + }); +} + +} // anonymous namespace + +// tinygemm interface +template +void tinygemm_kernel(const uint8_t* __restrict__ A, const int8_t* __restrict__ B, scalar_t* __restrict__ C, + int32_t* __restrict__ Ctmp, const float* __restrict__ As, const float* __restrict__ Bs, + int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc, bool brg) { + tinygemm_kernel(A, B, C, Ctmp, As, Bs, nullptr, M, N, K, lda, ldb, ldc, brg); +} + +#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \ + template void tinygemm_kernel( \ + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, TYPE* __restrict__ C, \ + int32_t* __restrict__ Ctmp, const float* __restrict__ As, const float* __restrict__ Bs, \ + int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc, bool brg) + +INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16); +INSTANTIATE_TINYGEMM_TEMPLATE(at::Half); + +std::tuple per_token_quant_int8_cpu(at::Tensor& A) { + RECORD_FUNCTION("sgl-kernel::per_token_quant_int8_cpu", std::vector({A})); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(A); + CHECK_DIM(2, A); + + int64_t M = A.size(0); + int64_t K = A.size(1); + int64_t lda = A.stride(0); + + const auto st = A.scalar_type(); + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, + "per_token_quant_int8: expect A to be bfloat16 or half."); + + auto Aq = at::empty({M, K}, A.options().dtype(at::kByte)); + auto As = at::empty({M}, A.options().dtype(at::kFloat)); + + AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "per_token_quant_int8", [&] { + uint8_t* __restrict__ Aq_data = Aq.data_ptr(); + float* __restrict__ As_data = As.data_ptr(); + const scalar_t* __restrict__ A_data = A.data_ptr(); + + at::parallel_for(0, M, 0, [&] (int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_data + m * K, + As_data[m], + A_data + m * lda, + K); + } + }); + }); + return std::make_tuple(Aq, As); +} + +// weight : static, per-channel, symmetric +// activation : dynamic, per-token, symmetric +// +// mat1 : [M, K] +// mat2 : [N, K] +// scales1 : [M] +// scales2 : [N] +// bias : [N] +// out : [M, N] +// +at::Tensor int8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2, + at::Tensor& scales1, at::Tensor& scales2, + std::optional& bias, at::ScalarType out_dtype, bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector({mat1, mat2, scales1, scales2, bias})); + + auto packed_w = is_vnni 
? mat2 : convert_weight_packed(mat2); + + CHECK_INPUT(mat1); + CHECK_INPUT(mat2); + CHECK_INPUT(scales1); + CHECK_INPUT(scales2); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat1.size(1); + + // see [NOTE]: s8s8 igemm compensation in avx512-vnni + CHECK_EQ(mat2.size(1), (int64_t)(is_vnni ? K + sizeof(int32_t) : K)); + CHECK_EQ(scales1.numel(), M); + CHECK_EQ(scales2.numel(), N); + + TORCH_CHECK(mat1.scalar_type() == at::kByte, "int8_scaled_mm: expect mat1 to be uint8."); + TORCH_CHECK(mat2.scalar_type() == at::kChar, "int8_scaled_mm: expect mat2 to be int8."); + TORCH_CHECK(scales1.scalar_type() == at::kFloat && scales2.scalar_type() == at::kFloat, + "int8_scaled_mm: expect scales to be float32."); + + auto out = at::empty({M, N}, mat1.options().dtype(out_dtype)); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "int8_scaled_mm_kernel_impl", [&] { + int8_scaled_mm_kernel_impl( + out.data_ptr(), + mat1.data_ptr(), + packed_w.data_ptr(), + scales1.data_ptr(), + scales2.data_ptr(), + bias_data, + M, + N, + K); + }); + return out; +} + +// fused `per_token_quant_int8_cpu` and `int8_scaled_mm_cpu` +at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2, + const std::optional& bias, at::ScalarType out_dtype, bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector({mat1, mat2, scales2, bias})); + + auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1); + CHECK_INPUT(mat2); + CHECK_INPUT(scales2); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat1.size(1); + int64_t lda = mat1.stride(0); + + // see [NOTE]: s8s8 igemm compensation in avx512-vnni + CHECK_EQ(mat2.size(1), (int64_t)(is_vnni ? 
K + sizeof(int32_t) : K)); + CHECK_EQ(scales2.numel(), N); + + const auto st = mat1.scalar_type(); + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, + "int8_scaled_mm_with_quant: expect A to be bfloat16 or half."); + TORCH_CHECK(st == out_dtype, + "int8_scaled_mm_with_quant: expect A has same dtype with out_dtype."); + TORCH_CHECK(mat2.scalar_type() == at::kChar, + "int8_scaled_mm_with_quant: expect mat2 to be int8."); + TORCH_CHECK(scales2.scalar_type() == at::kFloat, + "int8_scaled_mm_with_quant: expect scales to be float32."); + + const int64_t buffer_size = M * K + M * sizeof(float); + auto buffer = at::empty({buffer_size}, mat1.options().dtype(at::kByte)); + auto out = at::empty({M, N}, mat1.options().dtype(out_dtype)); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "int8_scaled_mm_with_quant_kernel_impl", [&] { + uint8_t* __restrict__ Aq_data = buffer.data_ptr(); + float* __restrict__ As_data = (float*)((void*)(Aq_data + M * K)); + const scalar_t* __restrict__ A_data = mat1.data_ptr(); + + at::parallel_for(0, M, 0, [&] (int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_data + m * K, + As_data[m], + A_data + m * lda, + K); + } + }); + + int8_scaled_mm_kernel_impl( + out.data_ptr(), + Aq_data, + packed_w.data_ptr(), + As_data, + scales2.data_ptr(), + bias_data, + M, + N, + K); + }); + return out; +} diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe.cpp b/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe.cpp new file mode 100644 index 0000000..beeccff --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe.cpp @@ -0,0 +1,1330 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +// [NOTE]: Fused MoE kernel with AMX +// +// This file contains implementations for +// * `moe_align_block_size` +// * `fused_moe` +// +// The functionality is identical to triton kernel, excepts: +// * fuse silu_and_mul with gemm1, therefore this kernel +// allocates 2 intermediate_caches instead of 3 +// * add `offsets` in `moe_align_block_size` which keeps track +// of starting offset for each M block. this is for keeping +// output of silu_and_mul in sorted order, thus load_A for +// the 2nd gemm would be contiguous, therefore we can directly +// load A from intermediate_cache1. +// +// TODO: +// 1. tune BLOCK_M and BLOCK_N (BLOCK_N * K fit L2) +// 2. add prefetch for load A which is indexed access +// 3. 
abstract at::native::cpublas::brgemm with WoQ gemm (M = 1 & M != 1) +// + +template +inline void fill_stub(scalar_t* __restrict__ out, scalar_t val, int64_t size) { + using Vec = at::vec::Vectorized; + const Vec data_vec(val); + at::vec::map([data_vec](Vec out) { return out = data_vec; }, out, out, size); +} + +template +inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) { + using Vec = at::vec::Vectorized; + // no remainder + #pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += Vec::size()) { + Vec data = Vec::loadu(input + d); + data.store(out + d); + } +} + +template +inline void copy_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, float weight, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec weight_vec = fVec(weight); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d) * weight_vec; + fVec data1 = fVec::loadu(input + d + fVec::size()) * weight_vec; + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] * weight); + } +} + +// acc from [topk, K] to [K] +template +inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + if (topk == 1) { + // do copy for topk = 1 + copy_stub(out, input, K); + } else { + // do sum for topk != 1 + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= K - kVecSize; d += kVecSize) { + fVec sum_fvec0 = fVec(0.f); + fVec sum_fvec1 = fVec(0.f); + for (int t = 0; t < topk; ++t) { + bVec x_bvec = bVec::loadu(input + t * K + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec); + + sum_fvec0 += x_fvec0; + sum_fvec1 += x_fvec1; + } + bVec out_bvec = convert_from_float_ext(sum_fvec0, sum_fvec1); + out_bvec.store(out + d); + } + for (; d < K; ++d) { + float sum_val = 0.f; + for (int t = 0; t < topk; ++t) { + sum_val += static_cast(input[t * K + d]); + } + out[d] = static_cast(sum_val); + } + } +} + +// out = input + input2 * scale +template +inline void add_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, + const scalar_t* __restrict__ input2, float scale, int64_t size) { + + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec s_vec = fVec(scale); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec x0 = fVec::loadu(input + d); + fVec x1 = fVec::loadu(input + d + fVec::size()); + + bVec y_bvec = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y_bvec); + + x0 = x0 + y0 * s_vec; + x1 = x1 + y1 * s_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + float(input2[d]) * scale); + } +} + +template +int moe_align_block_size( + int32_t* __restrict__ sorted_ids, + int32_t* __restrict__ expert_ids, + int32_t* __restrict__ topk_ids, + int32_t* __restrict__ total_cnts, + int32_t* __restrict__ cumsums, + int32_t* __restrict__ offsets, + int num_experts, + int numel, + int num_threads) { + + #define T_INDEX(tt) total_cnts + (tt) * num_experts + + // accumulate count of expert ids locally 
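+ // total_cnts is a (num_threads + 1) x num_experts matrix: thread `tid` counts
+ // its slice of topk_ids into row tid + 1, the vectorized pass below folds row
+ // t into row t + 1 so that row num_threads ends up with per-expert totals and
+ // row tid with the exclusive prefix (tokens counted by threads < tid). The
+ // second parallel_for then reuses row tid as the per-thread write cursor
+ // inside each expert's region of sorted_ids.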
+ at::parallel_for(0, numel, 0, [&](int begin, int end) { + int tid = at::get_thread_num(); + int32_t* __restrict__ local_cnts = T_INDEX(tid + 1); + + for (int i = begin; i < end; ++i) { + local_cnts[topk_ids[i]]++; + } + }); + + using iVec = at::vec::Vectorized; + for (int t = 0; t < num_threads; ++t) { + at::vec::map2( + [](iVec x, iVec y) { return x + y; }, + T_INDEX(t + 1), T_INDEX(t + 1), T_INDEX(t), num_experts); + } + + // the last row holds sums of each experts + int32_t* total_cnts_t_1 = T_INDEX(num_threads); + + cumsums[0] = 0; + for (int e = 0; e < num_experts; ++e) { + // accumulate `num_tokens_post_pad`, also as the expert offset + cumsums[e + 1] = cumsums[e] + div_up(total_cnts_t_1[e], BLOCK_M) * BLOCK_M; + + for (int k = cumsums[e]; k < cumsums[e + 1]; k += BLOCK_M) { + expert_ids[k / BLOCK_M] = e; + } + } + int num_tokens_post_pad = cumsums[num_experts]; + + at::parallel_for(0, numel, 0, [&](int begin, int end) { + int tid = at::get_thread_num(); + // thread tid offsets in `total_cnts` + int32_t* __restrict__ offsets = T_INDEX(tid); + + for (int i = begin; i < end; ++i) { + int32_t expert_id = topk_ids[i]; + int32_t b_offset = cumsums[expert_id]; + int32_t t_offset = offsets[expert_id]; + sorted_ids[b_offset + t_offset] = i; + offsets[expert_id]++; + } + }); + + // debug: the offset for thread t_1 should be identical to t_2 + int32_t* total_cnts_t_2 = T_INDEX(num_threads - 1); + for (int e = 0; e < num_experts; ++e) { + TORCH_CHECK(total_cnts_t_1[e] == total_cnts_t_2[e]); + } + + // padding value for sorted_ids: numel + auto sorted_id_size = [=](const int32_t* sorted_ids_ptr) { + for (int d = 0; d < BLOCK_M; ++d) { + if (sorted_ids_ptr[d] == numel) { return d; } + } + return BLOCK_M; + }; + + // offsets holds starting offset for each valida M blocks + // shape : [num_token_blocks + 1] + offsets[0] = 0; + const int num_token_blocks = num_tokens_post_pad / BLOCK_M; + at::parallel_for(0, num_token_blocks, GRAIN_SIZE / BLOCK_M, [&](int begin, int end) { + for (int mb = begin; mb < end; ++mb) { + offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M); + } + }); + // TODO: do we need to vecterize this ? 
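+ // Serial prefix sum over num_tokens_post_pad / BLOCK_M entries: offsets[mb + 1]
+ // currently holds the number of valid (non-padded) rows in block mb; after this
+ // loop it becomes a cumulative start offset, e.g. block sizes {3, 1, 4} turn
+ // offsets into {0, 3, 4, 8}. The trip count is typically small, so the scalar
+ // loop is cheap.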
+ for (int mb = 0; mb < num_token_blocks; ++mb) { + offsets[mb + 1] += offsets[mb]; + } + // debug: the last value of offsets should be `numel` + TORCH_CHECK(offsets[num_token_blocks] == numel); + + return num_tokens_post_pad; +} + +// silu : shape leading dimension +// input0 [m_size, BLOCK_N] BLOCK_N +// input1 [m_size, BLOCK_N] BLOCK_N +// output [M * topk, N] N +template +inline void silu_and_mul( + scalar_t* __restrict__ output, + const float* __restrict__ input0, // x: x0, x1 + const float* __restrict__ input1, // y: y0, y1 + int64_t m_size, + int64_t N) { + + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + + const fVec one = fVec(1.f); + + // no remainder + for (int64_t m = 0; m < m_size; ++m) { + scalar_t* __restrict__ out = output + m * N; + const float* __restrict__ x = input0 + m * BLOCK_N; + const float* __restrict__ y = input1 + m * BLOCK_N; + + for (int64_t d = 0; d < BLOCK_N; d += bVec::size()) { + fVec x0 = fVec::loadu(x + d); + fVec x1 = fVec::loadu(x + d + fVec::size()); + fVec y0 = fVec::loadu(y + d); + fVec y1 = fVec::loadu(y + d + fVec::size()); + // silu + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + // mul + x0 = x0 * y0; + x1 = x1 * y1; + // convert + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + } +} + +template +struct tinygemm_kernel_nn2 { + static inline void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B0, const scalar_t* __restrict__ B1, + scalar_t* __restrict__ C, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn2 { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::BFloat16* __restrict__ B0, const at::BFloat16* __restrict__ B1, + at::BFloat16* __restrict__ C, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + static_assert(COLS % 2 == 0); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512bh va; + __m512bh vb0[COLS]; + __m512bh vb1[COLS]; + __m512 vc0[ROWS * COLS]; + __m512 vc1[ROWS * COLS]; + + auto loadc = [&](auto i) { + vc0[i] = _mm512_set1_ps(0.f); + vc1[i] = _mm512_set1_ps(0.f); + }; + Unroll{}(loadc); + + const int64_t K2 = K >> 1; + const int64_t lda2 = lda >> 1; + const int64_t ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const float* b0_ptr = reinterpret_cast(B0); + const float* b1_ptr = reinterpret_cast(B1); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + } + if constexpr (row == 0) { + vb0[col] = (__m512bh)(_mm512_loadu_si512(b0_ptr + k * ldb2 + col * 16)); + vb1[col] = (__m512bh)(_mm512_loadu_si512(b1_ptr + k * ldb2 + col * 16)); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b0_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + _mm_prefetch(b1_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + } + vc0[i] = _mm512_dpbf16_ps(vc0[i], va, vb0[col]); + vc1[i] = _mm512_dpbf16_ps(vc1[i], va, vb1[col]); + }; + for (int64_t k = 0; k < K2; ++k) { + Unroll{}(compute, k); + } + + using Vec = at::vec::Vectorized; + const Vec one = Vec(1.f); + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2, 4 
use 512bit store + if constexpr (col % 2 == 0) { + Vec x0 = vc0[row * COLS + col + 0]; + Vec x1 = vc0[row * COLS + col + 1]; + Vec y0 = vc1[row * COLS + col + 0]; + Vec y1 = vc1[row * COLS + col + 1]; + // silu + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + // mul + x0 = x0 * y0; + x1 = x1 * y1; + + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(__m512(x1), __m512(x0)))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn2::apply( \ + A + mb_start * lda, B0 + nb_start * 2, B1 + nb_start * 2, \ + C + mb_start * ldc + nb_start, K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B0, + const scalar_t* __restrict__ B1, + scalar_t* __restrict__ C, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + // pattern: 1-(2+2)-(8+8) + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 32; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +struct tinygemm_kernel_nn { + static inline void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, float* __restrict__ C, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::BFloat16* __restrict__ B, float* __restrict__ C, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + static_assert(COLS % 2 == 0); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512bh va; + __m512bh vb[COLS]; + __m512 vc[ROWS * COLS]; + + auto loadc = [&](auto i) { + vc[i] = _mm512_set1_ps(0.f); + }; + Unroll{}(loadc); + + const int64_t K2 = K >> 1; + const int64_t lda2 = lda >> 1; + const int64_t ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const float* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + } + if constexpr (row == 0) { + vb[col] = (__m512bh)(_mm512_loadu_si512(b_ptr + k * ldb2 + col * 16)); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + } + vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K2; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + 
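+ // Each vc[i] is one 16-lane fp32 accumulator tile stored straight into the
+ // float staging buffer C (row stride ldc); the bf16 down-conversion happens
+ // later in the caller (copy_mul_stub / add_mul_stub), keeping the epilogue in fp32.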
_mm512_storeu_ps(reinterpret_cast<__m512*>(C + row * ldc + col * 16), vc[i]); + + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN2(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, \ + K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B, + float* __restrict__ C, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + // pattern: 1-2-8 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 32; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN2(1, 32); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN2(2, 32); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void fused_experts_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ packed_w1, + const scalar_t* __restrict__ packed_w2, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // strides for w1: [E, 2N, K] + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + + const int64_t stride_e = 2 * N * K; + const int64_t stride_n = K; + + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + scalar_t* __restrict__ A = A_tmp + tid * BLOCK_M * K; + float* __restrict__ C0 = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + float* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + + // B shape [K, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const scalar_t* __restrict__ B0 = packed_w1 + expert_id * stride_e + nb0 * BLOCK_N * stride_n; + const scalar_t* __restrict__ B1 = packed_w1 + expert_id * stride_e + nb1 * BLOCK_N * stride_n; + + // 1.a load A + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + int64_t m_size = offsets[mb + 1] - offsets[mb]; + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || 
use_brgemm; + + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m] / topk; + copy_stub(A + m * K, input + index * K, K); + } + + if (use_brgemm) { + // 1.b gemm: C0 = A @ B0 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B0, + /* C */ C0); + + // 1.c gemm: C1 = A @ B1 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B1, + /* C */ C1); + + // 1.d silu and mul + const int64_t offset = offsets[mb]; + silu_and_mul( + ic1 + offset * N + nb * BLOCK_N, + C0, + C1, + m_size, + N); + } else { + // fused 1.bcd: silu_and_mul(A @ B0, A @ B1) + const int64_t offset = offsets[mb]; + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + offset * N + nb * BLOCK_N, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [E, K, N] as [E, OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_e2 = OC * IC; + const int64_t stride_oc = IC; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = offsets[mb + 1] - offsets[mb]; + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + // A ptr from ic1 of [M * topk, N] in sorted order + // so as to avoid copy A to tmp buffer again + const scalar_t* __restrict__ A = ic1 + offsets[mb] * N; + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const scalar_t* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc; + + // 2.a gemm: C = A @ B + if (use_brgemm) { + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B, + /* C */ C); + } else { + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + } + + // 2.b copy from C to ic2 in original order + // and also mul topk_weights in float32 + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m]; + float weight = topk_weights[index]; + copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 3: out = intermediate_cache2.sum(dim=1) + // from [M, topk, K] to [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + sum_stub(output + m * K, ic2 + m * topk * K, topk, K); + } + }); +} + +template +void shared_expert_kernel_impl( + scalar_t* 
__restrict__ output, + scalar_t* __restrict__ ic1, + float* __restrict__ C_tmp, + scalar_t* __restrict__ input, + const scalar_t* __restrict__ packed_w1, + const scalar_t* __restrict__ packed_w2, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + const int64_t stride_n = K; + + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + float* __restrict__ C0 = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + float* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + + //int64_t mb_start = mb * BLOCK_M; + //int64_t mb_size = std::min(M - mb_start, BLOCK_M); + + // A shape [m_size, K] + const scalar_t* A = input + mb * BLOCK_M * K; + + // B shape [K, n_size] in vnni format + const scalar_t* __restrict__ B0 = packed_w1 + nb0 * BLOCK_N * stride_n; + const scalar_t* __restrict__ B1 = packed_w1 + nb1 * BLOCK_N * stride_n; + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + if (use_brgemm) { + // 1.b gemm: C0 = A @ B0 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B0, + /* C */ C0); + + // 1.c gemm: C1 = A @ B1 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B1, + /* C */ C1); + + // 1.d silu and mul + silu_and_mul( + ic1 + mb * BLOCK_M * N + nb * BLOCK_N, + C0, + C1, + m_size, + N); + } else { + // fused 1.bcd: silu_and_mul(A @ B0, A @ B1) + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + mb * BLOCK_M * N + nb * BLOCK_N, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 2: output = intermediate_cache1 @ w2 + // w2 : [K, N] as [OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_oc = IC; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + const bool use_brgemm = can_use_brgemm(m_size); + 
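+ // Stage-2 epilogue (2.b below): the shared-expert GEMM result is combined with
+ // the routed experts' output as
+ //   out = C + routed_scaling_factor * fused_experts_out
+ // i.e. the shared expert is added on top of the already weighted MoE sum.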
is_brgemm_used = is_brgemm_used || use_brgemm; + + // A shape [m_size, IC] + const scalar_t* __restrict__ A = ic1 + mb * BLOCK_M * N; + + // B shape [IC, n_size] in vnni format + const scalar_t* __restrict__ B = packed_w2 + nb * BLOCK_N * stride_oc; + + // 2.a gemm: C = A @ B + if (use_brgemm) { + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B, + /* C */ C); + } else { + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + } + + // 2.b copy from C to output and add fused_experts_out + scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N; + const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N; + for (int64_t m = 0; m < m_size; ++m) { + add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); +} + +} // anonymous namespace + +// common checks +static inline void check_moe_scales( + bool use_int8_w8a8, + bool use_fp8_w8a16, + const std::optional& w1_scale, + const std::optional& w2_scale, + const std::optional> block_size, + const std::optional& a1_scale, + const std::optional& a2_scale) { + if (use_int8_w8a8) { + TORCH_CHECK(w1_scale.has_value(), "missing w1_scale for int8 w8a8."); + TORCH_CHECK(w2_scale.has_value(), "missing w2_scale for int8 w8a8."); + TORCH_CHECK(!a1_scale.has_value(), "static quantization for activation not supported."); + TORCH_CHECK(!a2_scale.has_value(), "static quantization for activation not supported."); + } + if (use_fp8_w8a16) { + TORCH_CHECK(w1_scale.has_value(), "missing w1_scale for fp8 w8a16."); + TORCH_CHECK(w2_scale.has_value(), "missing w2_scale for fp8 w8a16."); + TORCH_CHECK(block_size.has_value(), "missing block_size for fp8 w8a16."); + TORCH_CHECK(block_size.value().size() == 2, "expect block_size.size() to be 2."); + } +} + +#define CHECK_MOE_SCALES_FP8(DIM0, DIM1) \ + auto w1s = w1_scale.value(); \ + auto w2s = w2_scale.value(); \ + auto block_size_val = block_size.value(); \ + int64_t block_size_N = block_size_val[0]; \ + int64_t block_size_K = block_size_val[1]; \ + TORCH_CHECK(w1s.size(DIM0) == 2 * N / block_size_N); \ + TORCH_CHECK(w1s.size(DIM1) == K / block_size_K); \ + TORCH_CHECK(w2s.size(DIM0) == K / block_size_N); \ + TORCH_CHECK(w2s.size(DIM1) == N / block_size_K) + +// hidden_states: [M, K] +// w1: [E, 2N, K] +// w2: [E, K, N] +// topk_weights: [M, topk] +// topk_ids: [M, topk] (int32_t) +// +at::Tensor fused_experts_cpu( + at::Tensor& hidden_states, + at::Tensor& w1, + at::Tensor& w2, + at::Tensor& topk_weights, + at::Tensor& topk_ids, + bool inplace, + bool use_int8_w8a8, + bool use_fp8_w8a16, + const std::optional& w1_scale, + const std::optional& w2_scale, + const std::optional> block_size, + const std::optional& a1_scale, + const std::optional& a2_scale, + bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::fused_experts_cpu", std::vector({hidden_states, w1, w2, topk_weights, topk_ids})); + + auto packed_w1 = is_vnni ? w1 : convert_weight_packed(w1); + auto packed_w2 = is_vnni ? 
w2 : convert_weight_packed(w2); + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + const auto st = hidden_states.scalar_type(); + CHECK_INPUT(hidden_states); + CHECK_INPUT(w1); + CHECK_INPUT(w2); + CHECK_EQ(topk_weights.sizes(), topk_ids.sizes()); + CHECK_DIM(2, hidden_states); + CHECK_DIM(3, w1); + CHECK_DIM(3, w2); + CHECK_DIM(2, topk_weights); + CHECK_DIM(2, topk_ids); + + CHECK_EQ(topk_ids.scalar_type(), at::kInt); + CHECK_EQ(topk_weights.scalar_type(), at::kFloat); + + int64_t M = hidden_states.size(0); + int64_t K = hidden_states.size(1); + int64_t N = w1.size(1) / 2; + int64_t E = w1.size(0); + int64_t topk = topk_weights.size(1); + + // we use int32_t compensation for int8 w8a8 + int64_t packed_K = get_row_size(K, use_int8_w8a8); + int64_t packed_N = get_row_size(N, use_int8_w8a8); + + // check weight shapes + CHECK_EQ(w2.size(0), E); + CHECK_EQ(w2.size(1), K); + CHECK_EQ(packed_w1.size(2), packed_K); + CHECK_EQ(packed_w2.size(2), packed_N); + + // check scales + check_moe_scales(use_int8_w8a8, use_fp8_w8a16, w1_scale, w2_scale, block_size, a1_scale, a2_scale); + + at::Tensor out_hidden_states = inplace ? hidden_states : at::empty_like(hidden_states); + + // NB: worst case is each expert holds a block with remainder of 1 + // 1. sorted_ids : [M * topk + E * (BLOCK_M - 1)] + // 2. expert_ids : [max_num_blocks] + // 3. total_cnts : [T + 1, E] + // 4. cumsums : [E + 1] + // 5. offsets : [max_num_blocks + 1] + // + int num_threads = at::get_num_threads(); + int64_t max_num_tokens_padded = M * topk + E * (BLOCK_M - 1); + int64_t max_num_blocks = div_up(max_num_tokens_padded, BLOCK_M); + auto buffer = at::empty( + {max_num_tokens_padded + max_num_blocks + (num_threads + 1) * E + (E + 1) + (max_num_blocks + 1)}, + topk_ids.options()); + + int32_t* __restrict__ sorted_ids = buffer.data_ptr(); + int32_t* __restrict__ expert_ids = sorted_ids + max_num_tokens_padded; + int32_t* __restrict__ total_cnts = expert_ids + max_num_blocks; + int32_t* __restrict__ cumsums = total_cnts + (num_threads + 1) * E; + int32_t* __restrict__ offsets = cumsums + (E + 1); + + // init sorted_ids with `numel` as the padding number + // init expert_ids with `num_experts` + int64_t numel = M * topk; + at::parallel_for(0, max_num_blocks, GRAIN_SIZE / BLOCK_M, [&](int64_t begin, int64_t end) { + int64_t m_start = begin * BLOCK_M; + int64_t m_size = std::min((end - begin) * BLOCK_M, max_num_tokens_padded - m_start); + fill_stub(sorted_ids + m_start, (int32_t)numel, m_size); + fill_stub(expert_ids + begin, (int32_t)E, end - begin); + }); + // zero total_cnts and cumsums + at::parallel_for(0, (num_threads + 1) * E + (E + 1), GRAIN_SIZE, [&](int64_t begin, int64_t end) { + fill_stub(total_cnts + begin, 0, end - begin); + }); + + // align experts index + int64_t num_tokens_post_pad = moe_align_block_size( + sorted_ids, expert_ids, topk_ids.data_ptr(), total_cnts, cumsums, offsets, E, numel, num_threads); + + // unlike triton kernel, we fuse silu with gemm1 so only need 2 intermediate_caches: + // 1. intermediate_cache1 : [M * topk, N] + // 2. intermediate_cache2 : [M * topk, K] + // 3. A_tmp : [T, BLOCK_M * K] + // 4. C_tmp : [T, 2 * BLOCK_M * BLOCK_N] + // + // for int8 w8a8: + // 5. Aq_tmp : [M, K] or [M * topk, N] + // 6. As_tmp : [M * topk] + // + // for fp8 w8a16: + // 7. intermediate_cache0 : [M * topk, 2N] + // 8. B_tmp : [T, BLOCK_N, std::max(K, N)] + // + int64_t buffer_size_nbytes = M * topk * N * 2 + M * topk * K * 2 + + num_threads * BLOCK_M * K * (use_int8_w8a8 ? 
1 : 2) + + num_threads * 2 * BLOCK_M * BLOCK_N * sizeof(float); + + if (use_int8_w8a8) { + buffer_size_nbytes += std::max(M * K, M * topk * N) + M * topk * sizeof(float); + } + if (use_fp8_w8a16) { + buffer_size_nbytes += M * topk * 2 * N * 2 + num_threads * BLOCK_N * std::max(K, N) * 2; + } + + auto buffer2 = at::empty({buffer_size_nbytes}, hidden_states.options().dtype(at::kChar)); + + AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "fused_experts_kernel_impl", [&] { + scalar_t* __restrict__ intermediate_cache1 = (scalar_t*)((void*)(buffer2.data_ptr())); + scalar_t* __restrict__ intermediate_cache2 = intermediate_cache1 + M * topk * N; + + if (use_int8_w8a8) { + uint8_t* __restrict__ A_tmp = (uint8_t*)((void*)(intermediate_cache2 + M * topk * K)); + float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K)); + uint8_t* __restrict__ Aq_tmp = (uint8_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + float* __restrict__ As_tmp = (float*)((void*)(Aq_tmp + std::max(M * K, M * topk * N))); + + auto w1s = w1_scale.value(); + auto w2s = w2_scale.value(); + TORCH_CHECK(w1s.numel() == E * 2 * N); + TORCH_CHECK(w2s.numel() == E * K); + + fused_experts_int8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache1, + intermediate_cache2, + A_tmp, + C_tmp, + Aq_tmp, + As_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + topk_weights.data_ptr(), + sorted_ids, + expert_ids, + offsets, + M, + N, + K, + E, + topk, + num_tokens_post_pad); + } else if (use_fp8_w8a16) { + // here we just ignore C_tmp as it is not used + scalar_t* __restrict__ A_tmp = (scalar_t*)((void*)(intermediate_cache2 + M * topk * K)); + float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K)); + scalar_t* __restrict__ intermediate_cache0 = (scalar_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + scalar_t* __restrict__ B_tmp = (scalar_t*)((void*)(intermediate_cache0 + M * topk * 2 * N)); + + CHECK_MOE_SCALES_FP8(1, 2); + fused_experts_fp8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache0, + intermediate_cache1, + intermediate_cache2, + A_tmp, + B_tmp, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + block_size_N, + block_size_K, + topk_weights.data_ptr(), + sorted_ids, + expert_ids, + offsets, + M, + N, + K, + E, + topk, + num_tokens_post_pad); + } else { + scalar_t* __restrict__ A_tmp = intermediate_cache2 + M * topk * K; + float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K)); + + fused_experts_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache1, + intermediate_cache2, + A_tmp, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + topk_weights.data_ptr(), + sorted_ids, + expert_ids, + offsets, + M, + N, + K, + E, + topk, + num_tokens_post_pad); + } + }); + return out_hidden_states; +} + +// shared expert kernel +// +// hidden_states: [M, K] +// w1: [2N, K] +// w2: [K, N] +// fused_experts_out +at::Tensor shared_expert_cpu( + at::Tensor& hidden_states, + at::Tensor& w1, + at::Tensor& w2, + at::Tensor& fused_experts_out, + double routed_scaling_factor, + bool inplace, + bool use_int8_w8a8, + bool use_fp8_w8a16, + std::optional& w1_scale, + std::optional& w2_scale, + std::optional> block_size, + std::optional& a1_scale, + std::optional& a2_scale, + bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::shared_expert_cpu", 
std::vector({hidden_states, w1, w2})); + + auto packed_w1 = is_vnni ? w1 : convert_weight_packed(w1); + auto packed_w2 = is_vnni ? w2 : convert_weight_packed(w2); + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + const auto st = hidden_states.scalar_type(); + CHECK_INPUT(hidden_states); + CHECK_INPUT(fused_experts_out); + CHECK_INPUT(w1); + CHECK_INPUT(w2); + CHECK_DIM(2, hidden_states); + CHECK_DIM(2, w1); + CHECK_DIM(2, w2); + CHECK_EQ(hidden_states.sizes(), fused_experts_out.sizes()); + CHECK_EQ(hidden_states.scalar_type(), st); + + int64_t M = hidden_states.size(0); + int64_t K = hidden_states.size(1); + int64_t N = w1.size(0) / 2; + + // we use int32_t compensation for int8 w8a8 + int64_t packed_K = get_row_size(K, use_int8_w8a8); + int64_t packed_N = get_row_size(N, use_int8_w8a8); + + // check weight shapes + CHECK_EQ(w2.size(0), K); + CHECK_EQ(packed_w1.size(1), packed_K); + CHECK_EQ(packed_w2.size(1), packed_N); + + // check scales + check_moe_scales(use_int8_w8a8, use_fp8_w8a16, w1_scale, w2_scale, block_size, a1_scale, a2_scale); + + at::Tensor out_hidden_states = inplace ? hidden_states : at::empty_like(hidden_states); + + // unlike triton kernel, we fuse silu with gemm1 so only need 2 intermediate_caches: + // 1. intermediate_cache1 : [M, N] + // 2. C_tmp : [T, 2 * BLOCK_M * BLOCK_N] + // + // for int8 w8a8: + // 3. Aq_tmp : [M, K] or [M, N] + // 4. As_tmp : [M] + // + // for fp8 w8a16: + // 5. intermediate_cache0 : [M, 2N] + // 6. B_tmp: [T, BLOCK_M, max(K, N)] + // + int num_threads = at::get_num_threads(); + int64_t buffer_size_nbytes = M * N * 2 + num_threads * 2 * BLOCK_M * BLOCK_N * sizeof(float); + + if (use_int8_w8a8) { + buffer_size_nbytes += std::max(M * K, M * N) + M * sizeof(float); + } + if (use_fp8_w8a16) { + buffer_size_nbytes += M * 2 * N * 2 + num_threads * BLOCK_M * std::max(K, N) * 2; + } + + auto buffer = at::empty({buffer_size_nbytes}, hidden_states.options().dtype(at::kChar)); + AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "share_experts_kernel_impl", [&] { + scalar_t* __restrict__ intermediate_cache1 = (scalar_t*)((void*)(buffer.data_ptr())); + float* __restrict__ C_tmp = (float*)((void*)(intermediate_cache1 + M * N)); + + if (use_int8_w8a8) { + uint8_t* __restrict__ Aq_tmp = (uint8_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + float* __restrict__ As_tmp = (float*)((void*)(Aq_tmp + std::max(M * K, M * N))); + + auto w1s = w1_scale.value(); + auto w2s = w2_scale.value(); + TORCH_CHECK(w1s.numel() == 2 * N); + TORCH_CHECK(w2s.numel() == K); + + shared_expert_int8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache1, + C_tmp, + Aq_tmp, + As_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + fused_experts_out.data_ptr(), + routed_scaling_factor, + M, + N, + K); + } else if (use_fp8_w8a16) { + scalar_t* __restrict__ intermediate_cache0 = (scalar_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + scalar_t* __restrict__ B_tmp = (scalar_t*)((void*)(intermediate_cache0 + M * 2 * N)); + + CHECK_MOE_SCALES_FP8(0, 1); + shared_expert_fp8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache0, + intermediate_cache1, + B_tmp, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + block_size_N, + block_size_K, + fused_experts_out.data_ptr(), + routed_scaling_factor, + M, + N, + K); + } else { + shared_expert_kernel_impl( + 
out_hidden_states.data_ptr(), + intermediate_cache1, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + fused_experts_out.data_ptr(), + routed_scaling_factor, + M, + N, + K); + } + }); + return out_hidden_states; +} diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe_fp8.cpp b/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe_fp8.cpp new file mode 100644 index 0000000..84a6af2 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe_fp8.cpp @@ -0,0 +1,502 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "gemm.h" +#include "vec.h" + +// clang-format off + +namespace { + +template +inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) { + using Vec = at::vec::Vectorized; + // no remainder + #pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += Vec::size()) { + Vec data = Vec::loadu(input + d); + data.store(out + d); + } +} + +template +inline void copy_mul_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, float weight, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec weight_vec = fVec(weight); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + bVec x = bVec::loadu(input + d); + fVec x0, x1; + std::tie(x0, x1) = at::vec::convert_to_float(x); + x0 = x0 * weight_vec; + x1 = x1 * weight_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] * weight); + } +} + +// acc from [topk, K] to [K] +template +inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + if (topk == 1) { + // do copy for topk = 1 + copy_stub(out, input, K); + } else { + // do sum for topk != 1 + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= K - kVecSize; d += kVecSize) { + fVec sum_fvec0 = fVec(0.f); + fVec sum_fvec1 = fVec(0.f); + for (int t = 0; t < topk; ++t) { + bVec x_bvec = bVec::loadu(input + t * K + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec); + + sum_fvec0 += x_fvec0; + sum_fvec1 += x_fvec1; + } + bVec out_bvec = convert_from_float_ext(sum_fvec0, sum_fvec1); + out_bvec.store(out + d); + } + for (; d < K; ++d) { + float sum_val = 0.f; + for (int t = 0; t < topk; ++t) { + sum_val += static_cast(input[t * K + d]); + } + out[d] = static_cast(sum_val); + } + } +} + +// out = input + input2 * scale +template +inline void add_mul_stub( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ input2, + float scale, + int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec s_vec = fVec(scale); + + int64_t d; +#pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + bVec x_bvec = bVec::loadu(input + d); + fVec x0, x1; + std::tie(x0, x1) = at::vec::convert_to_float(x_bvec); + + bVec y_bvec = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y_bvec); + + x0 = x0 + y0 * s_vec; + x1 = x1 + y1 * s_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + 
float(input2[d]) * scale); + } +} + +template +inline void silu_and_mul_stub( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ input2, + int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + const fVec one = fVec(1.f); + + // no remainder +#pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += bVec::size()) { + bVec x = bVec::loadu(input + d); + fVec x0, x1; + std::tie(x0, x1) = at::vec::convert_to_float(x); + bVec y = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y); + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + x0 = x0 * y0; + x1 = x1 * y1; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } +} + +} // anonymous namespace + +template +void fused_experts_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t block_size_K, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache0 = hidden_states @ w1 + const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M); + const int64_t NB = div_up(2 * N, BLOCK_N); + int64_t scale_size_N = div_up(2 * N, block_size_N); + int64_t scale_size_K = div_up(K, block_size_K); + int64_t blocks_n_per_group = block_size_N / BLOCK_N; + + const int64_t stride_e = 2 * N * K; + const int64_t stride_n = K; + + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + scalar_t* __restrict__ A = A_tmp + tid * BLOCK_M * K; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + int64_t n_size = std::min(2 * N - nb * BLOCK_N, BLOCK_N); + + // B shape [K, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const at::Float8_e4m3fn* __restrict__ B = packed_w1 + expert_id * stride_e + nb * BLOCK_N * stride_n; + const float* __restrict__ Bs = w1s + expert_id * scale_size_N * scale_size_K + (nb / blocks_n_per_group) * scale_size_K; + + // 1.a load A + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + int64_t m_size = offsets[mb + 1] - offsets[mb]; + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m] / topk; + copy_stub(A + m * K, input + index * K, K); + } + + const int64_t offset = offsets[mb]; + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ ic0 + offset * 2 * N + nb * BLOCK_N, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ 2 * N, + /* 
brg */ use_brgemm, + /* block_size_K */ block_size_K); + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 1.5: intermediate_cache1 = silu(intermediate_cache0) + at::parallel_for(0, M * topk, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + silu_and_mul_stub( + ic1 + m * N, + ic0 + m * 2 * N, + ic0 + m * 2 * N + N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [E, K, N] as [E, OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + scale_size_N = div_up(K, block_size_N); + scale_size_K = div_up(N, block_size_K); + const int64_t stride_e2 = OC * IC; + const int64_t stride_oc = IC; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + alignas(64) scalar_t C[BLOCK_M * BLOCK_K]; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = offsets[mb + 1] - offsets[mb]; + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + // A ptr from ic1 of [M * topk, N] in sorted order + // so as to avoid copy A to tmp buffer again + const scalar_t* __restrict__ A = ic1 + offsets[mb] * N; + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const at::Float8_e4m3fn* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc; + const float* __restrict__ Bs = w2s + expert_id * scale_size_N * scale_size_K + (nb / blocks_n_per_group) * scale_size_K; + + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + + // 2.b copy from C to ic2 in original order + // and also mul topk_weights in float32 + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m]; + float weight = topk_weights[index]; + copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 3: out = intermediate_cache2.sum(dim=1) + // from [M, topk, K] to [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + sum_stub(output + m * K, ic2 + m * topk * K, topk, K); + } + }); +} + +#define INSTANTIATE_MOE_FP8_TEMPLATE(TYPE) \ + template void fused_experts_fp8_kernel_impl( \ + TYPE* __restrict__ output, \ + TYPE* __restrict__ ic0, \ + TYPE* __restrict__ ic1, \ + TYPE* __restrict__ ic2, \ + TYPE* __restrict__ A_tmp, \ + TYPE* __restrict__ B_tmp, \ + float* __restrict__ C_tmp, \ + const TYPE* __restrict__ input, \ + const at::Float8_e4m3fn* __restrict__ packed_w1, \ + const at::Float8_e4m3fn* __restrict__ packed_w2, \ + const float* __restrict__ w1s, \ + const float* __restrict__ w2s, \ + int64_t block_size_N, \ + int64_t block_size_K, \ + const float* __restrict__ topk_weights, \ + const int32_t* __restrict__ sorted_ids, \ + const int32_t* __restrict__ expert_ids, \ + const int32_t* __restrict__ 
offsets, \ + int64_t M, \ + int64_t N, \ + int64_t K, \ + int64_t E, \ + int64_t topk, \ + int64_t num_tokens_post_pad) + +INSTANTIATE_MOE_FP8_TEMPLATE(at::BFloat16); +INSTANTIATE_MOE_FP8_TEMPLATE(at::Half); + +template +void shared_expert_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t block_size_K, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache0 = hidden_states @ w1 + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(2 * N, BLOCK_N); + int64_t scale_size_K = div_up(K, block_size_K); + int64_t blocks_n_per_group = block_size_N / BLOCK_N; + + const bool use_brgemm = can_use_brgemm(M); + + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(2 * N - nb * BLOCK_N, BLOCK_N); + + tinygemm_kernel( + /* A */ input + mb * BLOCK_M * K, + /* B */ packed_w1 + nb * BLOCK_N * K, + /* C */ ic0 + mb * BLOCK_M * 2 * N + nb * BLOCK_N, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ w1s + (nb / blocks_n_per_group) * scale_size_K, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ 2 * N, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + } + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 1.5: intermediate_cache1 = silu(intermediate_cache0) + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + silu_and_mul_stub( + ic1 + m * N, + ic0 + m * 2 * N, + ic0 + m * 2 * N + N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [K, N] as [OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(K, BLOCK_N); + scale_size_K = div_up(N, block_size_K); + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + alignas(64) scalar_t C[BLOCK_M * BLOCK_K]; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + // 2.a gemm: C = A @ B + tinygemm_kernel( + /* A */ ic1 + mb * BLOCK_M * N, + /* B */ packed_w2 + nb * BLOCK_N * N, + /* C */ C, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ w2s + (nb / blocks_n_per_group) * scale_size_K, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + + // 2.b copy from C to output and add fused_experts_out + scalar_t* __restrict__ out = output + mb * BLOCK_M * 
K + nb * BLOCK_N; + const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N; + for (int64_t m = 0; m < m_size; ++m) { + add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size); + } + } + }); + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } +} + +#define INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(TYPE) \ + template void shared_expert_fp8_kernel_impl( \ + TYPE* __restrict__ output, \ + TYPE* __restrict__ ic0, \ + TYPE* __restrict__ ic1, \ + TYPE* __restrict__ B_tmp, \ + float* __restrict__ C_tmp, \ + const TYPE* __restrict__ input, \ + const at::Float8_e4m3fn* __restrict__ packed_w1, \ + const at::Float8_e4m3fn* __restrict__ packed_w2, \ + const float* __restrict__ w1s, \ + const float* __restrict__ w2s, \ + int64_t block_size_N, \ + int64_t block_size_K, \ + const TYPE* __restrict__ fused_experts_out, \ + float routed_scaling_factor, \ + int64_t M, \ + int64_t N, \ + int64_t K) + +INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(at::BFloat16); +INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(at::Half); diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe_int8.cpp b/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe_int8.cpp new file mode 100644 index 0000000..89d0fb5 --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/moe_int8.cpp @@ -0,0 +1,769 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +template +inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) { + using Vec = at::vec::Vectorized; + // no remainder + #pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += Vec::size()) { + Vec data = Vec::loadu(input + d); + data.store(out + d); + } +} + +template <> +inline void copy_stub(uint8_t* __restrict__ out, const uint8_t* __restrict__ input, int64_t size) { + // size might be 64x + 32 + std::memcpy(out, input, size * sizeof(uint8_t)); +} + +template +inline void copy_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, float weight, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec weight_vec = fVec(weight); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d) * weight_vec; + fVec data1 = fVec::loadu(input + d + fVec::size()) * weight_vec; + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] * weight); + } +} + +// acc from [topk, K] to [K] +template +inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + if (topk == 1) { + // do copy for topk = 1 + copy_stub(out, input, K); + } else { + // do sum for topk != 1 + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= K - kVecSize; d += kVecSize) { + fVec sum_fvec0 = fVec(0.f); + fVec sum_fvec1 = fVec(0.f); + for (int t = 0; t < topk; ++t) { + bVec x_bvec = bVec::loadu(input + t * K + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec); + + sum_fvec0 += x_fvec0; + sum_fvec1 += x_fvec1; + } + bVec out_bvec = convert_from_float_ext(sum_fvec0, sum_fvec1); + out_bvec.store(out + d); + } + for (; d < K; ++d) { + float 
sum_val = 0.f; + for (int t = 0; t < topk; ++t) { + sum_val += static_cast(input[t * K + d]); + } + out[d] = static_cast(sum_val); + } + } +} + +// out = input + input2 * scale +template +inline void add_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, + const scalar_t* __restrict__ input2, float scale, int64_t size) { + + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec s_vec = fVec(scale); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec x0 = fVec::loadu(input + d); + fVec x1 = fVec::loadu(input + d + fVec::size()); + + bVec y_bvec = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y_bvec); + + x0 = x0 + y0 * s_vec; + x1 = x1 + y1 * s_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + float(input2[d]) * scale); + } +} + +/// gemm for w13 +template +struct tinygemm_kernel_vnni { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B0, const int8_t* __restrict__ B1, scalar_t* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs0, const float* __restrict__ Bs1, + const int32_t* __restrict__ Bcomp0, const int32_t* __restrict__ Bcomp1, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_vnni { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B0, const int8_t* __restrict__ B1, at::BFloat16* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs0, const float* __restrict__ Bs1, + const int32_t* __restrict__ Bcomp0, const int32_t* __restrict__ Bcomp1, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + static_assert(COLS % 2 == 0); + + __m512i va; + __m512i vb0[COLS]; + __m512i vb1[COLS]; + __m512i vc0[ROWS * COLS]; + __m512i vc1[ROWS * COLS]; + __m512i vcomp0[COLS]; + __m512i vcomp1[COLS]; + __m512 was; + __m512 vbs0[COLS]; + __m512 vbs1[COLS]; + + auto loadc = [&](auto i) { + vc0[i] = _mm512_set1_epi32(0); + vc1[i] = _mm512_set1_epi32(0); + }; + Unroll{}(loadc); + + const int64_t K4 = K >> 2; + const int64_t lda4 = lda >> 2; + const int64_t ldb4 = ldb; // ldb * 4 >> 2; + const int32_t* a_ptr = reinterpret_cast(A); + const int32_t* b0_ptr = reinterpret_cast(B0); + const int32_t* b1_ptr = reinterpret_cast(B1); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = _mm512_set1_epi32(a_ptr[row * lda4 + k]); + } + if constexpr (row == 0) { + vb0[col] = _mm512_loadu_si512(b0_ptr + k * ldb4 + col * 16); + vb1[col] = _mm512_loadu_si512(b1_ptr + k * ldb4 + col * 16); + } + vc0[i] = _mm512_dpbusd_epi32(vc0[i], va, vb0[col]); + vc1[i] = _mm512_dpbusd_epi32(vc1[i], va, vb1[col]); + }; + for (int64_t k = 0; k < K4; ++k) { + Unroll{}(compute, k); + } + + auto scalec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + // load a scale + if constexpr(col == 0) { + was = _mm512_set1_ps(As[row]); + } + // load b scale and vcomp + if constexpr (row == 0) { + vbs0[col] = _mm512_loadu_ps(Bs0 + col * 16); + vbs1[col] = _mm512_loadu_ps(Bs1 + col * 16); + vcomp0[col] = 
_mm512_loadu_si512(Bcomp0 + col * 16); + vcomp1[col] = _mm512_loadu_si512(Bcomp1 + col * 16); + } + __m512 c0 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc0[i], vcomp0[col])); + __m512 c1 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc1[i], vcomp1[col])); + vc0[i] = _mm512_castps_si512(_mm512_mul_ps(_mm512_mul_ps(c0, was), vbs0[col])); + vc1[i] = _mm512_castps_si512(_mm512_mul_ps(_mm512_mul_ps(c1, was), vbs1[col])); + }; + Unroll{}(scalec); + + using Vec = at::vec::Vectorized; + const Vec one = Vec(1.f); + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2, 4 use 512bit store + if constexpr (col % 2 == 0) { + Vec x0 = _mm512_castsi512_ps(vc0[row * COLS + col + 0]); + Vec x1 = _mm512_castsi512_ps(vc0[row * COLS + col + 1]); + Vec y0 = _mm512_castsi512_ps(vc1[row * COLS + col + 0]); + Vec y1 = _mm512_castsi512_ps(vc1[row * COLS + col + 1]); + // silu + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + // mul + x0 = x0 * y0; + x1 = x1 * y1; + + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(__m512(x1), __m512(x0)))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_VNNI(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_vnni::apply( \ + A + mb_start * lda, B0 + nb_start * 4, B1 + nb_start * 4, \ + C + mb_start * ldc + nb_start, As + mb_start, \ + Bs0 + nb_start, Bs1 + nb_start, Bcomp0 + nb_start, Bcomp1 + nb_start,\ + K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B0, + const int8_t* __restrict__ B1, + scalar_t* __restrict__ C, + const float* __restrict__ As, + const float* __restrict__ Bs0, + const float* __restrict__ Bs1, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + const int32_t* Bcomp0 = reinterpret_cast(B0 + block_size_n() * K); + const int32_t* Bcomp1 = reinterpret_cast(B1 + block_size_n() * K); + + // pattern: 1-(2+2)-(8+8) + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 32; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_VNNI(1, 32); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +/// gemm for w2 +template +struct tinygemm_kernel_vnni2 { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, float* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_vnni2 { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, float* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + 
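+    // Sketch of this AVX512 u8s8 microkernel (int8 w8a8, w2 GEMM):
+    //   1. accumulate u8(A) x s8(B) dot products into int32 lanes with
+    //      _mm512_dpbusd_epi32 (VNNI), consuming 4 K-elements per step;
+    //   2. subtract the per-column compensation Bcomp (stored after the packed
+    //      int8 weights), which offsets the +128 unsigned shift applied to the
+    //      activations in quantize_row_int8;
+    //   3. convert to float and scale by the per-row activation scale As and
+    //      the per-column weight scale Bs before storing C in fp32.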
constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + static_assert(COLS % 2 == 0); + + __m512i va; + __m512i vb[COLS]; + __m512i vc[ROWS * COLS]; + __m512i vcomp[COLS]; + __m512 was; + __m512 vbs[COLS]; + + auto loadc = [&](auto i) { + vc[i] = _mm512_set1_epi32(0); + }; + Unroll{}(loadc); + + const int64_t K4 = K >> 2; + const int64_t lda4 = lda >> 2; + const int64_t ldb4 = ldb; // ldb * 4 >> 2; + const int32_t* a_ptr = reinterpret_cast(A); + const int32_t* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = _mm512_set1_epi32(a_ptr[row * lda4 + k]); + } + if constexpr (row == 0) { + vb[col] = _mm512_loadu_si512(b_ptr + k * ldb4 + col * 16); + } + vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K4; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + // load a scale + if constexpr(col == 0) { + was = _mm512_set1_ps(As[row]); + } + // load b scale and vcomp per 2 vectors + // also load bias if any + if constexpr (row == 0) { + if constexpr (col % 2 == 0) { + vbs[col + 0] = _mm512_loadu_ps(Bs + col * 16); + vbs[col + 1] = _mm512_loadu_ps(Bs + col * 16 + 16); + vcomp[col + 0] = _mm512_loadu_si512(Bcomp + col * 16); + vcomp[col + 1] = _mm512_loadu_si512(Bcomp + col * 16 + 16); + } + } + __m512 x = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[i], vcomp[col])); + x = _mm512_mul_ps(_mm512_mul_ps(x, was), vbs[col]); + _mm512_storeu_ps(reinterpret_cast<__m512*>(C + row * ldc + col * 16), x); + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_VNNI2(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_vnni2::apply( \ + A + mb_start * lda, B + nb_start * 4, C + mb_start * ldc + nb_start, \ + As + mb_start, Bs + nb_start, Bcomp + nb_start, \ + K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B, + float* __restrict__ C, + const float* __restrict__ As, + const float* __restrict__ Bs, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + // B compensation + const int32_t* Bcomp = reinterpret_cast(B + block_size_n() * K); + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int64_t mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_VNNI2(1, 32); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +} // anonymous namespace + +template +void fused_experts_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + uint8_t* __restrict__ A_tmp, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* 
__restrict__ w2s, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 0: quantize input to uint8, [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * K, + As_tmp[m], + input + m * K, + K); + } + }); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // strides for w1: [E, 2N, K] + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + + // K and N are packed for int8 + const int64_t packed_K = get_row_size(K); + const int64_t packed_N = get_row_size(N); + + const int64_t stride_e = 2 * N * packed_K; + const int64_t stride_n = packed_K; + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + uint8_t* __restrict__ A = A_tmp + tid * BLOCK_M * K; + + alignas(64) float As[BLOCK_M]; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + + // B shape [K, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const int8_t* __restrict__ B0 = packed_w1 + expert_id * stride_e + nb0 * BLOCK_N * stride_n; + const int8_t* __restrict__ B1 = packed_w1 + expert_id * stride_e + nb1 * BLOCK_N * stride_n; + const float* __restrict__ Bs0 = w1s + expert_id * 2 * N + nb0 * BLOCK_N; + const float* __restrict__ Bs1 = w1s + expert_id * 2 * N + nb1 * BLOCK_N; + + // 1.a load A + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + int64_t m_size = offsets[mb + 1] - offsets[mb]; + + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m] / topk; + copy_stub(A + m * K, Aq_tmp + index * K, K); + As[m] = As_tmp[index]; + } + + // fused 1.b: silu_and_mul(A @ B0, A @ B1) + const int64_t offset = offsets[mb]; + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + offset * N + nb * BLOCK_N, + /* As */ As, + /* Bs0 */ Bs0, + /* Bs1 */ Bs1, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + }); + + // stage 1.5: quantize ic1 to uint8, [M * topk, N] + at::parallel_for(0, M * topk, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * N, + As_tmp[m], + ic1 + m * N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [E, K, N] as [E, OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_e2 = OC * packed_N; + const int64_t stride_oc = packed_N; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + for (int64_t i = 
begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = offsets[mb + 1] - offsets[mb]; + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + // A ptr from ic1 of [M * topk, N] in sorted order + // so as to avoid copy A to tmp buffer again + const uint8_t* __restrict__ A = Aq_tmp + offsets[mb] * N; + const float* __restrict__ As = As_tmp + offsets[mb]; + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const int8_t* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc; + const float* __restrict__ Bs = w2s + expert_id * K + nb * BLOCK_N; + + // 2.a gemm: C = A @ B + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* As */ As, + /* Bs */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + + // 2.b copy from C to ic2 in original order + // and also mul topk_weights in float32 + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m]; + float weight = topk_weights[index]; + copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size); + } + } + }); + + // stage 3: out = intermediate_cache2.sum(dim=1) + // from [M, topk, K] to [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + sum_stub(output + m * K, ic2 + m * topk * K, topk, K); + } + }); +} + +#define INSTANTIATE_MOE_INT8_TEMPLATE(TYPE) \ + template void fused_experts_int8_kernel_impl ( \ + TYPE* __restrict__ output, TYPE* __restrict__ ic1, \ + TYPE* __restrict__ ic2, uint8_t* __restrict__ A_tmp, \ + float* __restrict__ C_tmp, uint8_t* __restrict__ Aq_tmp, \ + float* __restrict__ As_tmp, const TYPE* __restrict__ input, \ + const int8_t* __restrict__ packed_w1, const int8_t* __restrict__ packed_w2, \ + const float* __restrict__ w1s, const float* __restrict__ w2s, \ + const float* __restrict__ topk_weights, const int32_t* __restrict__ sorted_ids, \ + const int32_t* __restrict__ expert_ids, const int32_t* __restrict__ offsets, \ + int64_t M, int64_t N, int64_t K, int64_t E, int64_t topk, int64_t num_tokens_post_pad) + +INSTANTIATE_MOE_INT8_TEMPLATE(at::BFloat16); +INSTANTIATE_MOE_INT8_TEMPLATE(at::Half); + +template +void shared_expert_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 0: quantize input to uint8, [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * K, + As_tmp[m], + input + m * K, + K); + } + }); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + + // K and N are packed for int8 + const int64_t packed_K = get_row_size(K); + const int64_t packed_N = get_row_size(N); + const int64_t stride_n = packed_K; 
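+  // Packed-row note: for int8 w8a8, get_row_size() returns a row length that
+  // also reserves room for the int32 compensation values stored alongside the
+  // packed int8 weights (see the Bcomp pointers in the tinygemm kernels
+  // above), so the strides here are derived from packed_K / packed_N rather
+  // than the logical K / N.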
+ + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + + // A shape [m_size, K] + const uint8_t* A = Aq_tmp + mb * BLOCK_M * K; + const float* As = As_tmp + mb * BLOCK_M; + + // B shape [K, n_size] in vnni format + const int8_t* __restrict__ B0 = packed_w1 + nb0 * BLOCK_N * stride_n; + const int8_t* __restrict__ B1 = packed_w1 + nb1 * BLOCK_N * stride_n; + const float* __restrict__ Bs0 = w1s + nb0 * BLOCK_N; + const float* __restrict__ Bs1 = w1s + nb1 * BLOCK_N; + + // fused 1.b: silu_and_mul(A @ B0, A @ B1) + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + mb * BLOCK_M * N + nb * BLOCK_N, + /* As */ As, + /* Bs0 */ Bs0, + /* Bs1 */ Bs1, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + }); + + // stage 1.5: quantize ic1 to uint8, [M * topk, N] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * N, + As_tmp[m], + ic1 + m * N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [K, N] as [OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_oc = packed_N; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + // A shape [m_size, IC] + const uint8_t* __restrict__ A = Aq_tmp + mb * BLOCK_M * N; + const float* __restrict__ As = As_tmp + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + const int8_t* __restrict__ B = packed_w2 + nb * BLOCK_N * stride_oc; + const float* __restrict__ Bs = w2s + nb * BLOCK_N; + + // 2.a gemm: C = A @ B + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* As */ As, + /* Bs */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + + // 2.b copy from C to output and add fused_experts_out + scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N; + const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N; + for (int64_t m = 0; m < m_size; ++m) { + add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size); + } + } + }); +} + +#define INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(TYPE) \ + template void shared_expert_int8_kernel_impl ( \ + TYPE* __restrict__ output, TYPE* __restrict__ ic1, \ + float* __restrict__ C_tmp, uint8_t* __restrict__ Aq_tmp, \ + float* __restrict__ As_tmp, const TYPE* __restrict__ input, \ + const int8_t* __restrict__ packed_w1, const int8_t* __restrict__ packed_w2, \ + const float* __restrict__ w1s, const float* __restrict__ w2s, \ + const TYPE* __restrict__ fused_experts_out, float 
routed_scaling_factor, \ + int64_t M, int64_t N, int64_t K) + +INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(at::BFloat16); +INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(at::Half); diff --git a/vllm_v0.10.0/csrc/cpu/sgl-kernels/vec.h b/vllm_v0.10.0/csrc/cpu/sgl-kernels/vec.h new file mode 100644 index 0000000..160845c --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/sgl-kernels/vec.h @@ -0,0 +1,308 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#pragma once + +// clang-format off + +#if defined(__AVX512F__) && defined(__AVX512BF16__) && defined(__AMX_BF16__) +#define CPU_CAPABILITY_AVX512 +#endif + +#include +#include + +namespace { + +using namespace at::vec; + +template , int> = 0> +inline Vectorized convert_from_float_ext(const Vectorized& a, const Vectorized& b) { + return at::vec::convert_from_float(a, b); +} + +#if defined(CPU_CAPABILITY_AVX512) + +// `at::vec::convert_from_float<>` from PyTorch doesn't have avx512-bf16 intrinsics +// use native instruction for bfloat16->float32 conversion +template <> +inline Vectorized convert_from_float_ext(const Vectorized& a, const Vectorized& b) { + return (__m512i)(_mm512_cvtne2ps_pbh(__m512(b), __m512(a))); +} + +#define CVT_BF16_TO_FP32(a) \ + _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16)) + +#define CVT_FP16_TO_FP32(a) \ + _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) + +// this doesn't handle NaN. +inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) { + const __m512i x = _mm512_cvtepu8_epi16(fp8_vec); + + const __m512i mant = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x07)), 4); + const __m512i raw_exp = _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x78)), 3); + const __m512i exp = _mm512_slli_epi16(_mm512_add_epi16(raw_exp, _mm512_set1_epi16(120)), 7); + const __m512i nonsign = _mm512_or_si512(exp, mant); + + const __m512i sign = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x80)), 8); + const __m512i combined = _mm512_or_si512(nonsign, sign); + + const __mmask32 is_nonzero = _mm512_cmpneq_epi16_mask(x, _mm512_setzero_si512()); + return (__m512bh)_mm512_maskz_mov_epi16(is_nonzero, combined); +} + +inline __m512bh cvt_e4m3_bf16_intrinsic_without_denorm(__m256i fp8_vec) { + // The following conversion is without denorm behavior, that is to say, + // Max subnorm : S.0000.111 = 0.875 ∗ 2**(−6) + // Min subnorm : S.0000.001 = 2**(−9) + // 0.0019 ~ 0.0137 cannot be converted correctly. 
+ __m512i x = _mm512_cvtepu8_epi16(fp8_vec); + auto mask = _mm512_cmpneq_epi16_mask( + _mm512_and_si512(x, _mm512_set1_epi16(127)), + _mm512_setzero_si512()); // mask = x & 0x7f + auto mask_nan = _mm512_cmpneq_epi16_mask( + _mm512_and_si512(x, _mm512_set1_epi16(127)), + _mm512_set1_epi16(127)); // mask_nan = x & 0x7f + auto mantissa = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(7)), 4); // mantissa = (x & 7) << 4 + auto exponent = _mm512_add_epi16( + _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(120)), 3), + _mm512_set1_epi16(120)); // exponent = (((x >> 3) & 15) + 120) + auto nonsign = _mm512_maskz_mov_epi16(mask, _mm512_or_si512(mantissa, _mm512_slli_epi16(exponent, 7))); + nonsign = _mm512_mask_mov_epi16(_mm512_set1_epi16(0x7fff), mask_nan, nonsign); // deal with Nan + return (__m512bh)(_mm512_or_si512( + nonsign, + _mm512_slli_epi16( + _mm512_and_si512(x, _mm512_set1_epi16(128)), + 8))); // add sign (x & 128) << 8 +} + +inline __m512bh cvt_e4m3_bf16_intrinsic_with_denorm(__m256i fp8_vec) { + __m512i x = _mm512_cvtepu8_epi16(fp8_vec); + __m512i lg2mant = _mm512_mask_mov_epi16( + _mm512_mask_mov_epi16( + _mm512_setzero_si512(), _mm512_test_epi16_mask(x, _mm512_set1_epi16(2)), _mm512_set1_epi16(1)), + _mm512_test_epi16_mask(x, _mm512_set1_epi16(4)), + _mm512_set1_epi16(2)); + return (__m512bh)(_mm512_or_si512( + _mm512_maskz_mov_epi16( + _mm512_cmpneq_epi16_mask(_mm512_and_si512(x, _mm512_set1_epi16(127)), _mm512_setzero_si512()), + _mm512_mask_blend_epi16( + _mm512_test_epi16_mask(x, _mm512_set1_epi16(120)), + _mm512_or_si512( + _mm512_and_si512( + _mm512_sllv_epi16( + _mm512_and_si512(x, _mm512_set1_epi16(3)), _mm512_sub_epi16(_mm512_set1_epi16(7), lg2mant)), + _mm512_set1_epi16(0x007f)), + _mm512_slli_epi16(_mm512_add_epi16(lg2mant, _mm512_set1_epi16(118)), 7)), + _mm512_or_si512( + _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(7)), 4), + _mm512_slli_epi16( + _mm512_add_epi16( + _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(120)), 3), _mm512_set1_epi16(120)), + 7)))), + _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(128)), 8))); +} + +inline __m512bh CVT_FP8_TO_BF16(__m256i a) { +#ifdef SGLANG_CPU_FP8_CVT_FTZ + return cvt_e4m3_bf16_intrinsic_no_nan(a); +#else + return cvt_e4m3_bf16_intrinsic_with_denorm(a); +#endif +} + +#endif + +// vector to scalar reduction +#if defined(CPU_CAPABILITY_AVX512) && 0 +inline float vec_reduce_sum(const Vectorized& a) { + return _mm512_reduce_add_ps(__m512(a)); +} + +inline float vec_reduce_max(const Vectorized& a) { + return _mm512_reduce_max_ps(__m512(a)); +} +#else +inline float vec_reduce_sum(const Vectorized& a) { + return vec_reduce_all([](Vectorized& x, Vectorized& y) { return x + y; }, a); +} + +inline float vec_reduce_max(const Vectorized& a) { + return vec_reduce_all([](Vectorized& x, Vectorized& y) { return maximum(x, y); }, a); +} +#endif + +// https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282 +template +inline void quantize_row_int8(uint8_t* __restrict__ Aq, float& As, + const scalar_t* __restrict__ A, int64_t K, float eps = 1e-7) { + + float amax = 0.f; // absolute max + for (int64_t k = 0; k < K; ++k) { + const float val = static_cast(A[k]); + amax = std::max(amax, std::abs(val)); + } + + amax = std::max(amax, eps); + const float scale = amax / 127; + const float inv_scale = 127 / amax; + + for (int64_t k = 0; k < K; ++k) { + const float val = static_cast(A[k]) * inv_scale; + Aq[k] = 
(uint8_t)(std::round(val)) + 128; + } + As = scale; +} + +#if defined(CPU_CAPABILITY_AVX512) +template <> +inline void quantize_row_int8(uint8_t* __restrict__ Aq, float& As, + const at::BFloat16* __restrict__ A, int64_t K, float eps) { + + const __m512 signBit = _mm512_set1_ps(-0.0f); + const __m512i off = _mm512_set1_epi32(128); + + // K is 32x, no remainder + float amax = 0.f; + __m512 vamax0 = _mm512_set1_ps(0.f); + __m512 vamax1 = _mm512_set1_ps(0.f); + for (int64_t k = 0; k < K; k += 32) { + __m512i va = _mm512_loadu_si512((void*)(A + k)); + __m512 va0 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 0)); + __m512 va1 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 1)); + vamax0 = _mm512_max_ps(vamax0, _mm512_andnot_ps(signBit, va0)); + vamax1 = _mm512_max_ps(vamax1, _mm512_andnot_ps(signBit, va1)); + } + amax = _mm512_reduce_max_ps(_mm512_max_ps(vamax0, vamax1)); + amax = std::max(amax, eps); + const float scale = amax / 127; + const float inv_scale = 127 / amax; + const __m512 vd = _mm512_set1_ps(inv_scale); + + for (int64_t k = 0; k < K; k += 32) { + __m512i va = _mm512_loadu_si512((void*)(A + k)); + __m512 va0 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 0)); + __m512 va1 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 1)); + va0 = _mm512_mul_ps(va0, vd); + va1 = _mm512_mul_ps(va1, vd); + va0 = _mm512_roundscale_ps(va0, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + va1 = _mm512_roundscale_ps(va1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m128i i0 = _mm512_cvtepi32_epi8(_mm512_add_epi32(_mm512_cvtps_epi32(va0), off)); + __m128i i1 = _mm512_cvtepi32_epi8(_mm512_add_epi32(_mm512_cvtps_epi32(va1), off)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(Aq + k), _mm256_set_m128i(i1, i0)); + } + As = scale; +} +#endif + +// transpose utils +// taken from my PR in ggml: https://github.com/ggml-org/llama.cpp/pull/8998 +#if defined(CPU_CAPABILITY_AVX512) +inline void transpose_16x16_32bit(__m512i * v) { + __m512i v1[16]; + v1[0] = _mm512_unpacklo_epi32(v[0], v[1]); + v1[1] = _mm512_unpackhi_epi32(v[0], v[1]); + v1[2] = _mm512_unpacklo_epi32(v[2], v[3]); + v1[3] = _mm512_unpackhi_epi32(v[2], v[3]); + v1[4] = _mm512_unpacklo_epi32(v[4], v[5]); + v1[5] = _mm512_unpackhi_epi32(v[4], v[5]); + v1[6] = _mm512_unpacklo_epi32(v[6], v[7]); + v1[7] = _mm512_unpackhi_epi32(v[6], v[7]); + v1[8] = _mm512_unpacklo_epi32(v[8], v[9]); + v1[9] = _mm512_unpackhi_epi32(v[8], v[9]); + v1[10] = _mm512_unpacklo_epi32(v[10], v[11]); + v1[11] = _mm512_unpackhi_epi32(v[10], v[11]); + v1[12] = _mm512_unpacklo_epi32(v[12], v[13]); + v1[13] = _mm512_unpackhi_epi32(v[12], v[13]); + v1[14] = _mm512_unpacklo_epi32(v[14], v[15]); + v1[15] = _mm512_unpackhi_epi32(v[14], v[15]); + + v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]); + v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]); + v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]); + v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]); + v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]); + v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]); + v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]); + v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]); + v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]); + v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]); + v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]); + v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]); + v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]); + v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]); + v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]); + v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]); + + v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88); + v1[1] = 
_mm512_shuffle_i32x4(v[1], v[5], 0x88); + v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88); + v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88); + v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd); + v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd); + v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd); + v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd); + v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88); + v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88); + v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88); + v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88); + v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd); + v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd); + v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd); + v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd); + + v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88); + v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88); + v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88); + v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88); + v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88); + v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88); + v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88); + v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88); + v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd); + v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd); + v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd); + v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd); + v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd); + v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd); + v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd); + v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd); +} + +// remove warning : ignoring attributes on template argument ‘__m512i’ [-Wignored-attributes] +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" + +// transpose from [2, 32] to [32, 2] +inline std::tuple<__m512i, __m512i> transpose_2x32_16bit(__m512i r0, __m512i r1) { + // r0: {a0, a1, ..., a31} + // r1: {b0, b1, ..., b31} + // + // d0: {a0, b0, ..., a15, b15} + // d1: {a16, b16, ..., a31, b31} + // + __m512i d0 = _mm512_unpacklo_epi16(r0, r1); + __m512i d1 = _mm512_unpackhi_epi16(r0, r1); + r0 = _mm512_shuffle_i32x4(d0, d1, 0x88); + r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd); + d0 = _mm512_shuffle_i32x4(r0, r1, 0x88); + d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd); + return std::make_tuple(d0, d1); +} +#pragma GCC diagnostic pop + +#endif + +// TODO: debug print, remove me later +template +void print_array(scalar_t* ptr, int size) { + for (int d = 0; d < size; ++d) { + if (d % 16 == 0) { std::cout << std::endl; } + std::cout << ptr[d] << " "; + } + std::cout << std::endl; +} + +} // anonymous namespace diff --git a/vllm_v0.10.0/csrc/cpu/shm.cpp b/vllm_v0.10.0/csrc/cpu/shm.cpp new file mode 100644 index 0000000..7e64e1c --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/shm.cpp @@ -0,0 +1,818 @@ +#include "cpu/cpu_types.hpp" + +#include +#include +#include +#include + +namespace { +#define MAX_SHM_RANK_NUM 8 +#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) +static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0); +#define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1) +#define MIN_THREAD_PROCESS_SIZE (256) +#define MAX_P2P_SEND_TENSOR_NUM 8 + +template +struct KernelVecType { + using scalar_vec_t = void; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::BF16Vec16; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::FP16Vec16; +}; + +struct ThreadSHMContext { 
+ volatile char _curr_thread_stamp[2]; + volatile char _ready_thread_stamp[2]; + int local_stamp_buffer_idx; + int remote_stamp_buffer_idx; + int thread_id; + int thread_num; + int rank; + int group_size; + size_t _spinning_count; + int swizzled_ranks[MAX_SHM_RANK_NUM]; + void* thread_shm_ptrs[MAX_SHM_RANK_NUM]; + ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM]; + size_t _thread_buffer_mask[2]; + char _padding2[40]; + + ThreadSHMContext(const int thread_id, const int thread_num, const int rank, + const int group_size, void* thread_shm_ptr) + : local_stamp_buffer_idx(0), + remote_stamp_buffer_idx(0), + thread_id(thread_id), + thread_num(thread_num), + rank(rank), + group_size(group_size), + _spinning_count(0) { + static_assert(sizeof(ThreadSHMContext) % 64 == 0); + TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); + TORCH_CHECK((size_t)this % 64 == 0); + TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); + _curr_thread_stamp[0] = 1; + _curr_thread_stamp[1] = 1; + _ready_thread_stamp[0] = 0; + _ready_thread_stamp[1] = 0; + _thread_buffer_mask[0] = 0; + _thread_buffer_mask[1] = 0; + for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { + shm_contexts[i] = nullptr; + thread_shm_ptrs[i] = nullptr; + swizzled_ranks[i] = (i + rank) % group_size; + } + set_context(rank, this, thread_shm_ptr); + } + + void set_stamp_buffer_idx(int local, int remote) { + local_stamp_buffer_idx = local; + remote_stamp_buffer_idx = remote; + } + + void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) { + TORCH_CHECK(rank < MAX_SHM_RANK_NUM); + TORCH_CHECK(ptr); + TORCH_CHECK(thread_shm_ptr); + TORCH_CHECK_EQ(ptr->thread_num, thread_num); + TORCH_CHECK_EQ(ptr->thread_id, thread_id); + shm_contexts[rank] = ptr; + thread_shm_ptrs[rank] = thread_shm_ptr; + } + + template + T* get_thread_shm_ptr(int rank) { + return reinterpret_cast( + reinterpret_cast(thread_shm_ptrs[rank]) + + (PER_THREAD_SHM_BUFFER_OFFSET & + _thread_buffer_mask[local_stamp_buffer_idx])); + } + + void next_buffer() { + _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF; + } + + char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; } + + char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; } + + void next_stamp() { + _mm_mfence(); + _curr_thread_stamp[local_stamp_buffer_idx] += 1; + } + + void commit_ready_stamp() { + _mm_mfence(); + _ready_thread_stamp[local_stamp_buffer_idx] = + _curr_thread_stamp[local_stamp_buffer_idx]; + } + + int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } + + template + void wait_for_all(Cond&& cond) { + for (int idx = 1; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + wait_for_one(rank, std::forward(cond)); + } + } + + template + void wait_for_one(int rank, Cond&& cond) { + ThreadSHMContext* rank_ctx = shm_contexts[rank]; + for (;;) { + char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx); + char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx); + char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx); + char rank_ready_stamp = + rank_ctx->get_ready_stamp(remote_stamp_buffer_idx); + if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp, + rank_ready_stamp)) { + break; + } + ++_spinning_count; + _mm_pause(); + } + } + + static bool check_no_buffer_conflict(char local_curr_stamp, + char local_ready_stamp, + char rank_curr_stamp, + char rank_ready_stamp) { + char temp = rank_curr_stamp + 2; + return local_curr_stamp != temp; + } + + static bool check_stamp_ready(char local_curr_stamp, char 
local_ready_stamp, + char rank_curr_stamp, char rank_ready_stamp) { + char temp = local_curr_stamp + 1; + return (local_curr_stamp == rank_ready_stamp) || (temp == rank_ready_stamp); + } + + std::string to_string() const { + std::stringstream ss; + ss << "SHMContext:"; + ss << "\nrank: " << rank; + ss << "\ngroup_size: " << group_size; + ss << "\nthread_num: " << thread_num; + ss << "\nthread_id: " << thread_id; + + ss << "\nshm_ctx_stat_loop_seq: ["; + for (int i = 0; i < group_size; ++i) { + ss << swizzled_ranks[i] << ", "; + } + ss << "]"; + + ss << "\nshm_contexts: ["; + for (int i = 0; i < group_size; ++i) { + if (shm_contexts[i]) { + ss << shm_contexts[i]->rank << ", "; + } + } + ss << "]"; + + return ss.str(); + } +}; + +class SHMManager { + public: + explicit SHMManager(const std::string& name, const int rank, + const int group_size) + : _rank(rank), + _group_size(group_size), + _thread_num(torch::get_num_threads()), + _shm_names({""}), + _shared_mem_ptrs({nullptr}), + _shm_ctx(nullptr) { + _shm_names[rank] = get_shm_name(name, rank); + _shared_mem_ptrs[rank] = init_shm(rank); + _shm_ctx = reinterpret_cast(_shared_mem_ptrs[rank]); + + for (int i = 0; i < _thread_num; ++i) { + ThreadSHMContext* ctx = new (_shm_ctx + i) + ThreadSHMContext(i, _thread_num, _rank, _group_size, + compute_thread_shm_ptr(_shm_ctx, i)); + } + } + + void join(const std::string& name) { + for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) { + if (rank_idx != _rank) { + TORCH_CHECK(_shm_names[rank_idx].empty()); + TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr); + _shm_names[rank_idx] = get_shm_name(name, rank_idx); + _shared_mem_ptrs[rank_idx] = init_shm(rank_idx); + ThreadSHMContext* target_ctx = + reinterpret_cast(_shared_mem_ptrs[rank_idx]); + for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) { + _shm_ctx[thread_idx].set_context( + rank_idx, target_ctx + thread_idx, + compute_thread_shm_ptr(target_ctx, thread_idx)); + } + } + } + } + + ~SHMManager() { destroy_shm(); } + + ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; } + + static std::string get_shm_name(const std::string& name, int rank) { + return name + "_" + std::to_string(rank); + } + + static int64_t create_singleton_instance(const std::string& name, + const int group_size, + const int rank) { + std::lock_guard guard(SingletonInstancesLock); + SingletonInstances.emplace_back( + std::make_unique(name, rank, group_size)); + return static_cast(SingletonInstances.size() - 1); + } + + static SHMManager* get_singleton_instance(int64_t handle) { + return SingletonInstances[handle].get(); + } + + protected: + static std::vector> SingletonInstances; + static std::mutex SingletonInstancesLock; + + private: + static size_t round_to_alignment(size_t num) { + return ((num + 63) / 64) * 64; + } + + int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) { + int8_t* thread_shm_ptr = + reinterpret_cast(ctx) + + round_to_alignment(_thread_num * sizeof(ThreadSHMContext)); + return thread_shm_ptr + + thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES); + } + + size_t compute_shm_size() { + const size_t rounded_rank_buffer_size = + round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num; + const size_t rounded_thread_shm_ctx_size = + round_to_alignment(_thread_num * sizeof(ThreadSHMContext)); + const size_t shm_size = + rounded_thread_shm_ctx_size + rounded_rank_buffer_size; + return shm_size; + } + + void* init_shm(int target_rank) { + const std::string& shm_name = _shm_names[target_rank]; + const int 
local_rank = _rank; + const size_t shm_size = compute_shm_size(); + + int fd = -1; + if (local_rank == target_rank) { + fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR, + S_IRUSR | S_IWUSR); + + if (fd == -1) + TORCH_CHECK(false, "create shm in SHMManager failed. errno: " + + std::to_string(errno)); + + if (ftruncate(fd, shm_size) == -1) + TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " + + std::to_string(errno)); + } else { + fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR); + + if (fd == -1) + TORCH_CHECK(false, "open shm in SHMManager failed. errno: " + + std::to_string(errno)); + } + + void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (shm_ptr == MAP_FAILED) { + TORCH_CHECK(false, + "mmap in SHMManager failed. errno: " + std::to_string(errno)); + } + + if (close(fd) != 0) { + TORCH_CHECK( + false, "close in SHMManager failed. errno: " + std::to_string(errno)); + } + + TORCH_CHECK((size_t)shm_ptr % 64 == 0); + + return shm_ptr; + } + + void destroy_shm() { + std::stringstream ss; + ss << "local rank " << _rank << ": ["; + for (int thread_id = 0; thread_id < _thread_num; ++thread_id) { + ss << _shm_ctx[thread_id]._spinning_count << ", "; + } + ss << "]\n"; + + for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { + if (_shared_mem_ptrs[i] != nullptr) { + munmap(_shared_mem_ptrs[i], compute_shm_size()); + } + + if (!_shm_names[i].empty()) { + shm_unlink(_shm_names[i].c_str()); + } + } + } + + int _rank; + int _group_size; + int _thread_num; + std::array _shm_names; + std::array _shared_mem_ptrs; + ThreadSHMContext* _shm_ctx; +}; + +namespace shm_cc_ops { +template +void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { + int thread_num = ctx->thread_num; + int64_t total_bytes = elem_num * sizeof(scalar_t); + int64_t total_units_num = + (total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE; + int64_t per_thread_units_num = + (total_units_num + thread_num - 1) / thread_num; + int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t); + int64_t max_per_thread_iteration_elem_num = + (PER_THREAD_SHM_BUFFER_BYTES >> 1) / + sizeof(scalar_t); // Note: double buffer + int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num; + +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < thread_num; ++i) { + int64_t offset = i * per_thread_elem_num; + int64_t end = std::min(elem_num, offset + per_thread_elem_num); + int64_t curr_elem_num = + std::min(max_per_thread_iteration_elem_num, end - offset); + ThreadSHMContext* thread_ctx = ctx + i; + bool fast_mode = ((end - offset) <= max_per_thread_iteration_elem_num); + + while (curr_elem_num > 0) { + inner_func(thread_ctx, offset, curr_elem_num, fast_mode); + + thread_ctx->next_stamp(); + thread_ctx->next_buffer(); + offset += max_per_thread_iteration_elem_num; + curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset); + } + } +} + +void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local, + int remote) { + int thread_num = ctx->thread_num; + for (int i = 0; i < thread_num; ++i) { + ThreadSHMContext* thread_ctx = ctx + i; + thread_ctx->set_stamp_buffer_idx(local, remote); + } +} +}; // namespace shm_cc_ops + +namespace shm_cc_ops { + +void memcpy_from_shm(void* dst, void* src, const int64_t bytes) { + const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_bytes; i += 64) { + vec_op::INT8Vec64 data( 
+ true, (int8_t*)src + i); // stream loading shm to avoid caching + data.save((int8_t*)dst + i); + } + if (aligned_bytes < bytes) { + vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes); + data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes); + } +} + +void memcpy_to_shm(void* dst, void* src, const int64_t bytes) { +#pragma GCC unroll 4 + for (int64_t i = 0; i < bytes; i += 64) { + vec_op::INT8Vec64 data((int8_t*)src + i); + data.nt_save((int8_t*)dst + i); + } +} + +void memcpy(void* dst, void* src, const int64_t bytes) { + const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_bytes; i += 64) { + vec_op::INT8Vec64 data((int8_t*)src + i); + data.save((int8_t*)dst + i); + } + if (aligned_bytes < bytes) { + vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes); + data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes); + } +} + +template +void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data, + size_t elem_num) { + CPU_KERNEL_GUARD_IN(all_reduce_sum_impl) + using vec_t = typename KernelVecType::scalar_vec_t; + constexpr int64_t vec_elem_num = vec_t::get_elem_num(); + const int worldsize = ctx->group_size; + + shm_cc_ops::shm_cc_loop( + ctx, elem_num, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num, bool fast_mode) { + int rank = thread_ctx->rank; + scalar_t* thread_shm_ptr = + thread_ctx->get_thread_shm_ptr(rank); + scalar_t* thread_data_ptr = data + data_offset; + int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t); + + scalar_t* remote_data_ptrs[RANKS - 1]; + vec_op::unroll_loop([&](int idx) { + remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr( + thread_ctx->get_swizzled_rank(idx + 1)); + }); + + if (!fast_mode) { + thread_ctx->wait_for_all(ThreadSHMContext::check_no_buffer_conflict); + } + + shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr, + thread_data_elem_num); + thread_ctx->commit_ready_stamp(); + int64_t aligned_data_elem_num = + (data_elem_num / vec_elem_num) * vec_elem_num; + int64_t i = 0; + thread_ctx->wait_for_all(ThreadSHMContext::check_stamp_ready); +#pragma GCC unroll 4 + for (; i < aligned_data_elem_num; i += vec_elem_num) { + vec_t local_data(thread_data_ptr + i); // load from cache + vec_op::FP32Vec16 local_data_fp32(local_data); + vec_op::unroll_loop([&](int idx) { + vec_t remote_data( + true, remote_data_ptrs[idx] + i); // stream load from shm + vec_op::FP32Vec16 remote_data_fp32(remote_data); + local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce + }); + vec_t reduced_data(local_data_fp32); + reduced_data.save(thread_data_ptr + i); + } + + if (i < data_elem_num) { + vec_t local_data(thread_data_ptr + i); // load from cache + vec_op::FP32Vec16 local_data_fp32(local_data); + vec_op::unroll_loop([&](int idx) { + vec_t remote_data( + true, remote_data_ptrs[idx] + i); // stream load from shm + vec_op::FP32Vec16 remote_data_fp32(remote_data); + local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce + }); + vec_t reduced_data(local_data_fp32); + reduced_data.save(thread_data_ptr + i, + data_elem_num - aligned_data_elem_num); + } + }); + + return; +} +}; // namespace shm_cc_ops + +std::vector> SHMManager::SingletonInstances = {}; +std::mutex SHMManager::SingletonInstancesLock = {}; + +template +void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) { + switch (ctx->group_size) { + case 2: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 
3: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 4: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 8: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + default: + TORCH_CHECK(false, + "Invalid world size: " + std::to_string(ctx->group_size)); + } +} + +template +void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num, + scalar_t** outputs, const int dst) { + CPU_KERNEL_GUARD_IN(shm_gather_impl) + const int worldsize = ctx->group_size; + TORCH_CHECK_LT(dst, worldsize); + shm_cc_ops::shm_cc_loop( + ctx, elem_num, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num, bool fast_mode) { + int rank = thread_ctx->rank; + scalar_t* thread_shm_ptr = + thread_ctx->get_thread_shm_ptr(rank); + + if (!fast_mode) { + thread_ctx->wait_for_all(ThreadSHMContext::check_no_buffer_conflict); + } + + shm_cc_ops::memcpy(thread_shm_ptr, data + data_offset, + data_elem_num * sizeof(scalar_t)); + thread_ctx->commit_ready_stamp(); + if (rank == dst) { + shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset, + data_elem_num * sizeof(scalar_t)); + for (int i = 1; i < worldsize; ++i) { + int src_rank = thread_ctx->get_swizzled_rank(i); + scalar_t* src_ptr = + thread_ctx->get_thread_shm_ptr(src_rank); // shm + scalar_t* dst_ptr = outputs[src_rank] + data_offset; + thread_ctx->wait_for_one(src_rank, + ThreadSHMContext::check_stamp_ready); + shm_cc_ops::memcpy(dst_ptr, src_ptr, + data_elem_num * sizeof(scalar_t)); + } + } + }); + + return; +} + +struct MemPiece { + void* ptr; + int64_t size; + + template + T* data_ptr() { + return reinterpret_cast(ptr); + } +}; + +struct TensorListMeta { + int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM]; + torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM]; + int64_t tensor_num; + int64_t total_bytes; + + TensorListMeta() : tensor_num(0), total_bytes(0) { + static_assert(sizeof(TensorListMeta) % 64 == 0); + static_assert(sizeof(TensorListMeta) < + MIN_THREAD_PROCESS_SIZE); // To ensure the metadata always + // hold by the thread 0 + for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) { + tensor_bytes[i] = 0; + tensor_ptrs[i] = nullptr; + tensor_types[i] = torch::ScalarType::Undefined; + } + } + + // For send and recv + void bind_tensor_list(std::vector& tensor_list) { + TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined, + "Re-bind TensorListMeta is not allowed.") + TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM); + tensor_num = tensor_list.size(); + int64_t bytes_sum = 0; + for (int i = 0; i < tensor_list.size(); ++i) { + torch::Tensor& t = tensor_list[i]; + TORCH_CHECK(t.is_contiguous()); + tensor_bytes[i] = t.nbytes(); + tensor_types[i] = t.scalar_type(); + tensor_ptrs[i] = t.data_ptr(); + bytes_sum += t.nbytes(); + } + total_bytes = bytes_sum; + } + + // For recv + std::vector generate_tensor_list() { + std::vector tensor_list; + tensor_list.reserve(tensor_num); + + for (int i = 0; i < tensor_num; ++i) { + int64_t bytes = tensor_bytes[i]; + auto type = tensor_types[i]; + int64_t elem_bytes = torch::elementSize(type); + + TORCH_CHECK_EQ(bytes % elem_bytes, 0); + int64_t elem_num = bytes / elem_bytes; + auto options = torch::TensorOptions().dtype(type).device(torch::kCPU); + tensor_list.emplace_back(torch::empty({elem_num}, options)); + } + return tensor_list; + } + + MemPiece get_data(int64_t offset) { + for (int i = 0; i < tensor_num; ++i) { + if (offset < tensor_bytes[i]) { + return {reinterpret_cast(tensor_ptrs[i]) + offset, + 
tensor_bytes[i] - offset}; + } + offset -= tensor_bytes[i]; + } + return {nullptr, 0}; + } + + private: + void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM]; + int8_t _padding[40]; +}; + +void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst, + const std::vector& tensor_list) { + CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl) + std::vector tensor_list_with_metadata; + tensor_list_with_metadata.reserve(1 + tensor_list.size()); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + tensor_list_with_metadata.emplace_back( + torch::empty({sizeof(TensorListMeta)}, options)); + tensor_list_with_metadata.insert(tensor_list_with_metadata.end(), + tensor_list.begin(), tensor_list.end()); + + torch::Tensor& metadata_tensor = tensor_list_with_metadata[0]; + TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta)); + + TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta(); + metadata->bind_tensor_list(tensor_list_with_metadata); + + shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1); + shm_cc_ops::shm_cc_loop( + ctx, metadata->total_bytes, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num, bool fast_mode) { + int rank = thread_ctx->rank; + int64_t curr_shm_offset = 0; + thread_ctx->wait_for_one(dst, + ThreadSHMContext::check_no_buffer_conflict); + while (curr_shm_offset < data_elem_num) { + MemPiece frag = metadata->get_data(data_offset + curr_shm_offset); + frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); + shm_cc_ops::memcpy( + thread_ctx->get_thread_shm_ptr(rank) + curr_shm_offset, + frag.ptr, frag.size); + curr_shm_offset += frag.size; + } + thread_ctx->commit_ready_stamp(); + }); +} + +std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, + int64_t src) { + CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl) + auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + torch::Tensor metadata_tensor = + torch::empty({sizeof(TensorListMeta)}, options); + + shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0); + ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready); + shm_cc_ops::memcpy(metadata_tensor.data_ptr(), + ctx->get_thread_shm_ptr(src), + sizeof(TensorListMeta)); + TensorListMeta* src_metadata = + reinterpret_cast(metadata_tensor.data_ptr()); + std::vector tensor_list_with_metadata = + src_metadata->generate_tensor_list(); + + TensorListMeta metadata; + metadata.bind_tensor_list(tensor_list_with_metadata); + TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num); + TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes); + + shm_cc_ops::shm_cc_loop( + ctx, metadata.total_bytes, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num, bool fast_mode) { + thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready); + int64_t curr_shm_offset = 0; + while (curr_shm_offset < data_elem_num) { + MemPiece frag = metadata.get_data(data_offset + curr_shm_offset); + frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); + shm_cc_ops::memcpy( + frag.ptr, + thread_ctx->get_thread_shm_ptr(src) + curr_shm_offset, + frag.size); + curr_shm_offset += frag.size; + } + }); + + std::vector tensor_list; + tensor_list.reserve(metadata.tensor_num - 1); + tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1, + tensor_list_with_metadata.end()); + + return tensor_list; +} +} // namespace + +void shm_gather(int64_t handle, torch::Tensor& data, + const std::optional>& outputs, + int64_t dst) { 
+ TORCH_CHECK(data.is_contiguous()) + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] { + CPU_KERNEL_GUARD_IN(shm_gather_impl) + + if (outputs.has_value()) { + TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM); + scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr}; + for (int i = 0; i < outputs->size(); ++i) { + output_ptrs[i] = outputs->at(i).data_ptr(); + } + shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel(), output_ptrs, + dst); + } else { + shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel(), (scalar_t**)(0), + dst); + } + + CPU_KERNEL_GUARD_OUT(shm_gather_impl) + }); +} + +void shm_all_gather(int64_t handle, const torch::Tensor& data, + torch::Tensor& output) { + TORCH_CHECK(data.is_contiguous()) + TORCH_CHECK(output.is_contiguous()) + + const int64_t input_elem_num = data.numel(); + const int64_t output_elem_num = output.numel(); + TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0); + const int world_size = output_elem_num / input_elem_num; + + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] { + CPU_KERNEL_GUARD_IN(shm_all_gather_impl) + auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx(); + TORCH_CHECK_EQ(ctx->group_size, world_size); + + scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr}; + for (int i = 0; i < world_size; ++i) { + output_ptrs[i] = output.data_ptr() + i * input_elem_num; + } + shm_gather_impl(ctx, data.data_ptr(), data.numel(), output_ptrs, + ctx->rank); + CPU_KERNEL_GUARD_OUT(shm_all_gather_impl) + }); +} + +void shm_allreduce(int64_t handle, torch::Tensor& data) { + TORCH_CHECK(data.is_contiguous()) + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] { + CPU_KERNEL_GUARD_IN(shm_allreduce_sum) + shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel()); + CPU_KERNEL_GUARD_OUT(shm_allreduce_sum) + }); +} + +void shm_send_tensor_list(int64_t handle, + const std::vector& tensor_list, + int64_t dst) { + CPU_KERNEL_GUARD_IN(shm_send_tensor_list) + shm_send_tensor_list_impl( + SHMManager::get_singleton_instance(handle)->get_shm_ctx(), dst, + tensor_list); + CPU_KERNEL_GUARD_OUT(shm_send_tensor_list) +} + +std::vector shm_recv_tensor_list(int64_t handle, int64_t src) { + CPU_KERNEL_GUARD_IN(shm_recv_tensor_list) + auto tensor_list = shm_recv_tensor_list_impl( + SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src); + CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list) + return tensor_list; +} + +int64_t init_shm_manager(const std::string& name, const int64_t group_size, + const int64_t rank) { + return SHMManager::create_singleton_instance(name, group_size, rank); +} + +std::string join_shm_manager(int64_t handle, const std::string& name) { + auto shm_manager = SHMManager::get_singleton_instance(handle); + TORCH_CHECK(shm_manager); + shm_manager->join(name); + return shm_manager->get_shm_ctx()->to_string(); +} diff --git a/vllm_v0.10.0/csrc/cpu/torch_bindings.cpp b/vllm_v0.10.0/csrc/cpu/torch_bindings.cpp new file mode 100644 index 0000000..f1738ae --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/torch_bindings.cpp @@ -0,0 +1,307 @@ +#include "cache.h" +#include "ops.h" +#include "core/registration.h" + +#include + +std::string init_cpu_threads_env(const std::string& cpu_ids); + +void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& b, const torch::Tensor& a_scales, + const torch::Tensor& 
b_scales, + const std::optional& bias); + +void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& b, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, + const torch::Tensor& azp_adj, + const std::optional& azp, + const std::optional& bias); + +#if defined(__powerpc64__) +void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& b, + const torch::Tensor& a_scales, + const torch::Tensor& b_scales, + const std::optional& bias); +#endif + +void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, + torch::Tensor& kv_cache, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens); + +int64_t init_shm_manager(const std::string& name, const int64_t group_size, + const int64_t rank); + +std::string join_shm_manager(int64_t handle, const std::string& name); + +void shm_allreduce(int64_t handle, torch::Tensor& data); + +void shm_gather(int64_t handle, torch::Tensor& data, + const std::optional>& outputs, + int64_t dst); + +void shm_all_gather(int64_t handle, const torch::Tensor& data, + torch::Tensor& output); + +void shm_send_tensor_list(int64_t handle, + const std::vector& tensor_list, + int64_t dst); + +std::vector shm_recv_tensor_list(int64_t handle, int64_t src); + +at::Tensor weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, + const std::optional& bias, + bool is_vnni); + +at::Tensor convert_weight_packed(at::Tensor& weight); + +at::Tensor fused_experts_cpu( + at::Tensor& hidden_states, at::Tensor& w1, at::Tensor& w2, + at::Tensor& topk_weights, at::Tensor& topk_ids, bool inplace, + bool use_int8_w8a8, bool use_fp8_w8a16, + const std::optional& w1_scale, + const std::optional& w2_scale, + const std::optional> block_size, + const std::optional& a1_scale, + const std::optional& a2_scale, bool is_vnni); + +at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2, + at::Tensor& scales2, + const std::optional& bias, + at::ScalarType out_dtype, bool is_vnni); + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + // vLLM custom ops + + // Attention ops + // Compute the attention between an input query and the cached keys/values + // using PagedAttention. + ops.def( + "paged_attention_v1(" + " Tensor! out, Tensor query, Tensor key_cache," + " Tensor value_cache, int num_kv_heads, float scale," + " Tensor block_tables, Tensor seq_lens, int block_size," + " int max_seq_len, Tensor? alibi_slopes," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," + " int tp_rank, int blocksparse_local_blocks," + " int blocksparse_vert_stride, int blocksparse_block_size," + " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); + + // PagedAttention V2. + ops.def( + "paged_attention_v2(" + " Tensor! out, Tensor! exp_sums, Tensor! max_logits," + " Tensor! tmp_out, Tensor query, Tensor key_cache," + " Tensor value_cache, int num_kv_heads, float scale," + " Tensor block_tables, Tensor seq_lens, int block_size," + " int max_seq_len, Tensor? alibi_slopes," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," + " int tp_rank, int blocksparse_local_blocks," + " int blocksparse_vert_stride, int blocksparse_block_size," + " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2); + + // Activation ops + + // Activation function used in SwiGLU. + ops.def("silu_and_mul(Tensor! 
out, Tensor input) -> ()"); + ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul); + + // Activation function used in GeGLU with `none` approximation. + ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul); + + // Activation function used in GeGLU with `tanh` approximation. + ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul); + + // GELU implementation used in GPT-2. + ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_new", torch::kCPU, &gelu_new); + + // Approximate GELU implementation. + ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCPU, &gelu_quick); + + // Layernorm + // Apply Root Mean Square (RMS) Normalization to the input tensor. + ops.def( + "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " + "()"); + ops.impl("rms_norm", torch::kCPU, &rms_norm); + + // In-place fused Add and RMS Normalization. + ops.def( + "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, " + "float epsilon) -> ()"); + ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm); + + // Rotary embedding + // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. + ops.def( + "rotary_embedding(Tensor positions, Tensor! query," + " Tensor!? key, int head_size," + " Tensor cos_sin_cache, bool is_neox) -> ()"); + ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); + + // Quantization +#if defined(__AVX512F__) || defined(__aarch64__) + at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + + // Compute int8 quantized tensor for given scaling factor. + ops.def( + "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," + "Tensor? azp) -> ()", + {stride_tag}); + ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); + + // Compute int8 quantized tensor and scaling factor + ops.def( + "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " + "Tensor!? azp) -> ()", + {stride_tag}); + ops.impl("dynamic_scaled_int8_quant", torch::kCPU, + &dynamic_scaled_int8_quant); + // W8A8 GEMM, supporting symmetric per-tensor or per-row/column + // quantization. + ops.def( + "cutlass_scaled_mm(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales, Tensor? bias) -> ()", + {stride_tag}); + ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); + // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column + // quantization. + ops.def( + "cutlass_scaled_mm_azp(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales, Tensor azp_adj," + " Tensor? azp, Tensor? bias) -> ()", + {stride_tag}); + ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); +#elif defined(__powerpc64__) + // Compute int8 quantized tensor for given scaling factor. + ops.def( + "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," + "Tensor? azp) -> ()"); + ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); + + // Compute int8 quantized tensor and scaling factor + ops.def( + "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " + "Tensor!? azp) -> ()"); + ops.impl("dynamic_scaled_int8_quant", torch::kCPU, + &dynamic_scaled_int8_quant); + // W8A8 GEMM, supporting symmetric quantization. 
+ ops.def( + "cutlass_scaled_mm(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales, Tensor? bias) -> ()"); + ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le); + // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column + // quantization. + ops.def( + "cutlass_scaled_mm_azp(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales, Tensor azp_adj," + " Tensor? azp, Tensor? bias) -> ()"); + ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); +#endif + +// SHM CCL +#ifdef __AVX512F__ + ops.def("init_shm_manager(str name, int group_size, int rank) -> int", + &init_shm_manager); + ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager); + ops.def("shm_allreduce(int handle, Tensor! data) -> ()"); + ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce); + ops.def( + "shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> " + "()"); + ops.impl("shm_gather", torch::kCPU, &shm_gather); + ops.def( + "shm_all_gather(int handle, Tensor data, Tensor! output) -> " + "()"); + ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather); + ops.def( + "shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> " + "()"); + ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list); + ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)", + &shm_recv_tensor_list); +#endif + + // sgl-kernels +#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__) + ops.def( + "weight_packed_linear(Tensor(a0!) mat1, Tensor(a1!) mat2, Tensor(a2!)? " + "bias, bool is_vnni) -> Tensor"); + ops.impl("weight_packed_linear", torch::kCPU, &weight_packed_linear); + ops.def("convert_weight_packed(Tensor! weight) -> Tensor"); + ops.impl("convert_weight_packed", torch::kCPU, &convert_weight_packed); + ops.def( + "fused_experts_cpu(Tensor! hidden_states, Tensor w1, Tensor w2, Tensor " + "topk_weights, Tensor topk_ids, bool inplace, bool use_int8_w8a8, bool " + "use_fp8_w8a16, Tensor? w1_scale, Tensor? w2_scale, SymInt[]? " + "block_size, Tensor? a1_scale, Tensor? a2_scale, bool is_vnni) -> " + "Tensor"); + ops.impl("fused_experts_cpu", torch::kCPU, &fused_experts_cpu); + ops.def( + "int8_scaled_mm_with_quant(Tensor mat1, Tensor mat2, Tensor scales2, " + "Tensor? bias, ScalarType out_dtype, bool is_vnni) -> Tensor"); + ops.impl("int8_scaled_mm_with_quant", torch::kCPU, + &int8_scaled_mm_with_quant); +#endif +} + +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { + // Cache ops + // Swap in (out) the cache blocks from src to dst. + cache_ops.def( + "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); + cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks); + + // Copy the cache blocks from src to dst. + cache_ops.def( + "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, " + "Tensor block_mapping) -> ()"); + cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks); + + // Reshape the key and value tensors and cache them. + cache_ops.def( + "reshape_and_cache(Tensor key, Tensor value," + " Tensor! key_cache, Tensor! value_cache," + " Tensor slot_mapping," + " str kv_cache_dtype," + " Tensor k_scale, Tensor v_scale) -> ()"); + cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); + + cache_ops.def( + "concat_and_cache_mla(Tensor kv_c, Tensor k_pe," + " Tensor! 
kv_cache," + " Tensor slot_mapping," + " str kv_cache_dtype," + " Tensor scale) -> ()"); + cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla); +} + +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { + // CPU utils + utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env); +} + +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) { + cpu_ops.def( + "mla_decode_kvcache(" + " Tensor! out, Tensor query, Tensor kv_cache," + " float scale, Tensor block_tables, Tensor seq_lens) -> ()"); + cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache); +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm_v0.10.0/csrc/cpu/utils.cpp b/vllm_v0.10.0/csrc/cpu/utils.cpp new file mode 100644 index 0000000..02514ed --- /dev/null +++ b/vllm_v0.10.0/csrc/cpu/utils.cpp @@ -0,0 +1,107 @@ +#ifndef VLLM_NUMA_DISABLED + #include + #include + #include + #include +#endif +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30 + #include + #include + #define gettid() syscall(SYS_gettid) +#endif + +#include "cpu_types.hpp" + +#ifdef VLLM_NUMA_DISABLED +std::string init_cpu_threads_env(const std::string& cpu_ids) { + return std::string( + "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has " + "no effect to setup thread affinity."); +} + +#endif + +#ifndef VLLM_NUMA_DISABLED +std::string init_cpu_threads_env(const std::string& cpu_ids) { + bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask->size > 0); + std::vector omp_cpu_ids; + omp_cpu_ids.reserve(omp_cpu_mask->size); + + constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp); + + for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) { + unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size]; + int i = 0; + while (group_mask) { + if (group_mask & 1) { + omp_cpu_ids.emplace_back(offset + i); + } + ++i; + group_mask >>= 1; + } + } + + // Memory node binding + if (numa_available() != -1) { + int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); + bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); + bitmask* src_mask = numa_get_membind(); + + int pid = getpid(); + + // move all existing pages to the specified numa node. + *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); + int page_num = numa_migrate_pages(pid, src_mask, mask); + if (page_num == -1) { + TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno)); + } + + // restrict memory allocation node. + numa_set_membind(mask); + numa_set_strict(1); + } + + // OMP threads binding + omp_set_num_threads((int)omp_cpu_ids.size()); + torch::set_num_threads((int)omp_cpu_ids.size()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); + + std::vector> thread_core_mapping; + thread_core_mapping.reserve(omp_cpu_ids.size()); + omp_lock_t writelock; + omp_init_lock(&writelock); + + #pragma omp parallel for schedule(static, 1) + for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(omp_cpu_ids[i], &mask); + int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask); + if (ret == -1) { + TORCH_CHECK(false, + "sched_setaffinity failed. 
errno: " + std::to_string(errno)); + } + + omp_set_lock(&writelock); + thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]); + omp_unset_lock(&writelock); + } + + omp_destroy_lock(&writelock); + + numa_free_nodemask(omp_cpu_mask); + + std::stringstream ss; + ss << "OMP threads binding of Process " << getpid() << ":\n"; + std::sort(thread_core_mapping.begin(), thread_core_mapping.end(), + [](auto&& a, auto&& b) { return a.second < b.second; }); + for (auto&& item : thread_core_mapping) { + ss << "\t" + << "OMP tid: " << item.first << ", core " << item.second << "\n"; + } + + return ss.str(); +} +#endif diff --git a/vllm_v0.10.0/csrc/cuda_compat.h b/vllm_v0.10.0/csrc/cuda_compat.h new file mode 100644 index 0000000..affa051 --- /dev/null +++ b/vllm_v0.10.0/csrc/cuda_compat.h @@ -0,0 +1,49 @@ +#pragma once + +#ifdef USE_ROCM + #include +#endif + +#if defined(USE_ROCM) && defined(__GFX9__) + #define WARP_SIZE 64 +#else + #define WARP_SIZE 32 +#endif + +#ifndef USE_ROCM + #define VLLM_LDG(arg) __ldg(arg) +#else + #define VLLM_LDG(arg) *(arg) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask) + #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) +#else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) + #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ + __shfl_xor(var, lane_mask, width) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) +#else + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ + __shfl_down_sync(uint32_t(-1), var, lane_delta) +#else + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) +#endif + +#ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) +#else + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#endif diff --git a/vllm_v0.10.0/csrc/cuda_utils.h b/vllm_v0.10.0/csrc/cuda_utils.h new file mode 100644 index 0000000..6e62ea2 --- /dev/null +++ b/vllm_v0.10.0/csrc/cuda_utils.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +#if defined(__HIPCC__) + #define HOST_DEVICE_INLINE __host__ __device__ + #define DEVICE_INLINE __device__ + #define HOST_INLINE __host__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) + #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__ + #define DEVICE_INLINE __device__ __forceinline__ + #define HOST_INLINE __host__ __forceinline__ +#else + #define HOST_DEVICE_INLINE inline + #define DEVICE_INLINE inline + #define HOST_INLINE inline +#endif + +#define CUDA_CHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int64_t get_device_attribute(int64_t attribute, int64_t device_id); + +int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); + +namespace cuda_utils { + +template +HOST_DEVICE_INLINE constexpr std::enable_if_t, T> +ceil_div(T a, T b) { + return (a + b - 1) / b; +} + +}; // namespace cuda_utils \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cuda_utils_kernels.cu 
b/vllm_v0.10.0/csrc/cuda_utils_kernels.cu new file mode 100644 index 0000000..0627a42 --- /dev/null +++ b/vllm_v0.10.0/csrc/cuda_utils_kernels.cu @@ -0,0 +1,35 @@ +#include "cuda_utils.h" +#ifdef USE_ROCM + #include + #include +#endif + +int64_t get_device_attribute(int64_t attribute, int64_t device_id) { + // Return the cached value on subsequent calls + static int value = [=]() { + int device = static_cast(device_id); + if (device < 0) { + CUDA_CHECK(cudaGetDevice(&device)); + } + int value; + CUDA_CHECK(cudaDeviceGetAttribute( + &value, static_cast(attribute), device)); + return static_cast(value); + }(); + + return value; +} + +int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { + int64_t attribute; + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html + // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 + +#ifdef USE_ROCM + attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; +#else + attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; +#endif + + return get_device_attribute(attribute, device_id); +} diff --git a/vllm_v0.10.0/csrc/cuda_view.cu b/vllm_v0.10.0/csrc/cuda_view.cu new file mode 100644 index 0000000..938bd4a --- /dev/null +++ b/vllm_v0.10.0/csrc/cuda_view.cu @@ -0,0 +1,39 @@ +#include +#include +#include + +// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned +// memory, and that UVA (Unified Virtual Addressing) is enabled. +torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) { + TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU"); + + // Get raw host pointer from CPU tensor + void* host_ptr = cpu_tensor.data_ptr(); + + // Get a device pointer corresponding to the pinned host memory + void* device_ptr = nullptr; + cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0); + TORCH_CHECK(err == cudaSuccess, + "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err)); + + // We'll use the same sizes, strides, and dtype as the CPU tensor. + // TODO: check if layout is respected. + auto sizes = cpu_tensor.sizes(); + auto strides = cpu_tensor.strides(); + auto options = cpu_tensor.options().device(torch::kCUDA); + + // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter, + // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the + // memory, so we don't free it here. + auto deleter = [](void*) { + // no-op, since the memory is owned by the original CPU tensor + }; + + torch::Tensor cuda_tensor = + torch::from_blob(device_ptr, sizes, strides, deleter, options); + + TORCH_CHECK(cuda_tensor.device().is_cuda(), + "Resulting tensor is not on CUDA device"); + + return cuda_tensor; +} diff --git a/vllm_v0.10.0/csrc/cumem_allocator.cpp b/vllm_v0.10.0/csrc/cumem_allocator.cpp new file mode 100644 index 0000000..fab6ca3 --- /dev/null +++ b/vllm_v0.10.0/csrc/cumem_allocator.cpp @@ -0,0 +1,349 @@ +// A CUDAPluggableAllocator based on cumem* APIs. 
+// Important: allocation size, CUdeviceptr and CUmemGenericAllocationHandle* +// need to be unsigned long long +#include + +extern "C" { + +#define PY_SSIZE_T_CLEAN +#include + +#include +#include +#include + +char error_msg[10240]; // 10KB buffer to store error messages +CUresult no_error = CUresult(0); +CUresult error_code = no_error; // store error code + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + error_code = error; \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \ + error_string, __FILE__, __LINE__); \ + std::cerr << error_msg << std::endl; \ + } \ + } while (0) + +// Global references to Python callables +// NOTE: this is borrowed reference, so we don't need to DECREF them. +// This brings the limitation that the allocator needs to be singleton. +static PyObject* g_python_malloc_callback = nullptr; +static PyObject* g_python_free_callback = nullptr; + +// --------------------------------------------------------------------------- +// Helper functions: + +void ensure_context(unsigned long long device) { + CUcontext pctx; + CUDA_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + // Ensure device context. + CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device)); + CUDA_CHECK(cuCtxSetCurrent(pctx)); + } +} + +void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + ensure_context(device); + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Allocate memory using cuMemCreate + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + if (error_code != 0) { + return; + } + CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); + if (error_code != 0) { + return; + } + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = device; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + if (error_code != 0) { + return; + } + // std::cout << "create_and_map: device=" << device << ", size=" << size << ", + // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; +} + +void unmap_and_release(unsigned long long device, ssize_t size, + CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + // std::cout << "unmap_and_release: device=" << device << ", size=" << size << + // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; + ensure_context(device); + CUDA_CHECK(cuMemUnmap(d_mem, size)); + if (error_code != 0) { + return; + } + CUDA_CHECK(cuMemRelease(*p_memHandle)); + if (error_code != 0) { + return; + } +} + +PyObject* create_tuple_from_c_integers(unsigned long long a, + unsigned long long b, + unsigned long long c, + unsigned long long d) { + // Create a new tuple of size 4 + PyObject* tuple = PyTuple_New(4); + if (!tuple) { + return NULL; // Return NULL on failure + } + + // Convert integers to Python objects and set them in the tuple + PyTuple_SetItem( + tuple, 0, + PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong + PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b)); + PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c)); + PyTuple_SetItem(tuple, 3, 
PyLong_FromUnsignedLongLong(d)); + + // Note: PyTuple_SetItem "steals" a reference to each object, + // so we do not need to Py_DECREF the PyLong objects explicitly. + + return tuple; // Return the created tuple +} + +// --------------------------------------------------------------------------- +// Our exported C functions that call Python: + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void* my_malloc(ssize_t size, int device, CUstream stream) { + ensure_context(device); + + // first allocation, align the size, and reserve an address, and also allocate + // a CUmemGenericAllocationHandle + + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Check if the allocation is supported + size_t granularity; + CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + if (error_code != 0) { + return nullptr; + } + size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; + + CUdeviceptr d_mem; + CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); + if (error_code != 0) { + return nullptr; + } + // allocate the CUmemGenericAllocationHandle + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)malloc( + sizeof(CUmemGenericAllocationHandle)); + + if (!g_python_malloc_callback) { + std::cerr << "ERROR: g_python_malloc_callback not set.\n"; + return nullptr; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* arg_tuple = create_tuple_from_c_integers( + (unsigned long long)device, (unsigned long long)alignedSize, + (unsigned long long)d_mem, (unsigned long long)p_memHandle); + + // Call g_python_malloc_callback + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL); + Py_DECREF(arg_tuple); + + if (!py_result) { + PyErr_Print(); + PyGILState_Release(gstate); + return nullptr; + } + + PyGILState_Release(gstate); + + // do the final mapping + create_and_map(device, alignedSize, d_mem, p_memHandle); + + return (void*)d_mem; +} + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void my_free(void* ptr, ssize_t size, int device, CUstream stream) { + // get memory handle from the pointer + if (!g_python_free_callback) { + std::cerr << "ERROR: g_python_free_callback not set.\n"; + return; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* py_ptr = + PyLong_FromUnsignedLongLong(reinterpret_cast(ptr)); + + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL); + + if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, + &recv_d_mem, &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return; + } + + PyGILState_Release(gstate); + + // recv_size == size + // recv_device == device + + // Free memory + + CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem; + 
CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + unmap_and_release(device, size, d_mem, p_memHandle); + + // free address and the handle + CUDA_CHECK(cuMemAddressFree(d_mem, size)); + if (error_code != 0) { + return; + } + free(p_memHandle); +} + +// --------------------------------------------------------------------------- +// Python extension boilerplate: + +// Python-exposed function: init_module(python_malloc, python_free) +static PyObject* py_init_module(PyObject* self, PyObject* args) { + PyObject* malloc_callback = nullptr; + PyObject* free_callback = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { + return nullptr; + } + + if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be callables"); + return nullptr; + } + + // Save the Python callables + // This module does not handle GC of these objects, so they must be kept alive + // outside of this module. + g_python_malloc_callback = malloc_callback; + g_python_free_callback = free_callback; + + Py_RETURN_NONE; +} + +static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + + if (error_code != 0) { + error_code = no_error; + PyErr_SetString(PyExc_RuntimeError, error_msg); + return nullptr; + } + + Py_RETURN_NONE; +} + +static PyObject* python_create_and_map(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + + if (error_code != 0) { + error_code = no_error; + PyErr_SetString(PyExc_RuntimeError, error_msg); + return nullptr; + } + + Py_RETURN_NONE; +} + +static PyMethodDef module_methods[] = { + {"init_module", (PyCFunction)py_init_module, METH_VARARGS, + "Initialize module with python_malloc and python_free callables."}, + {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS, + "Create and map memory on the device."}, + {"python_unmap_and_release", (PyCFunction)python_unmap_and_release, + METH_VARARGS, "Unmap and release memory on the device."}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef cumem_allocator_module = { + PyModuleDef_HEAD_INIT, "cumem_allocator", + "cumem-based 
allocator for CUDAPluggableAllocator", -1, module_methods}; + +PyMODINIT_FUNC PyInit_cumem_allocator(void) { + // Initialize the module + PyObject* module = PyModule_Create(&cumem_allocator_module); + if (!module) { + return NULL; + } + return module; +} +} // extern "C" diff --git a/vllm_v0.10.0/csrc/custom_all_reduce.cu b/vllm_v0.10.0/csrc/custom_all_reduce.cu new file mode 100644 index 0000000..a38d6fa --- /dev/null +++ b/vllm_v0.10.0/csrc/custom_all_reduce.cu @@ -0,0 +1,189 @@ +#include +#include +#include +#include + +#include "custom_all_reduce.cuh" + +// Fake pointer type, must match fptr_t type in ops.h. +// We use this type alias to indicate when pointers are passed in as int64_t. +using fptr_t = int64_t; +static_assert(sizeof(void*) == sizeof(fptr_t)); + +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, + bool fully_connected) { + int world_size = fake_ipc_ptrs.size(); + if (world_size > 8) + throw std::invalid_argument("world size > 8 is not supported"); + if (world_size % 2 != 0) + throw std::invalid_argument("Odd num gpus is not supported for now"); + if (rank < 0 || rank >= world_size) + throw std::invalid_argument("invalid rank passed in"); + + vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < world_size; i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), + rank_data.numel(), rank, world_size, + fully_connected); +} + +/** + * Make sure tensor t's data lies completely within ((char)t.data_ptr()) + + * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous() + * because it allows transpose of contiguous slice (i.e. slicing the first + * dimension). Currently, we require this because stride information is not + * passed into the kernels and we treat input tensors as flat. + * + * Examples + * A = torch.zeros(3, 3, 3) + * 1. A: OK + * 2. A[1:]: OK + * 3. A.permute(2, 0, 1): OK + * 4. A[1:].permute(2, 0, 1): OK + * 5. A[None].expand(2, -1, -1, -1): Not OK + * 6. A[:, 1:, 1:]: Not OK + */ +bool _is_weak_contiguous(torch::Tensor& t) { + return t.is_contiguous() || + (t.storage().nbytes() - t.storage_offset() * t.element_size() == + t.numel() * t.element_size()); +} + +/** + * Performs an out-of-place allreduce and stores result in out. + * + * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered. + * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first + * copied into _reg_buffer. 
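 + *
 + * Rough call pattern (illustrative; the Python custom-op layer drives this
 + * in practice):
 + *   all_reduce(fa, inp, out, 0, 0);                      // inp already IPC-registered
 + *   all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz); // unregistered inp staged via reg_buffer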
+ */ +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) { + auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); + TORCH_CHECK(_is_weak_contiguous(out)); + TORCH_CHECK(_is_weak_contiguous(inp)); + auto input_size = inp.numel() * inp.element_size(); + auto reg_buffer = reinterpret_cast(_reg_buffer); + if (reg_buffer) { + TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes); + AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, + cudaMemcpyDeviceToDevice, stream)); + } else { + reg_buffer = inp.data_ptr(); + } + switch (out.scalar_type()) { + case at::ScalarType::Float: { + fa->allreduce(stream, reinterpret_cast(reg_buffer), + reinterpret_cast(out.data_ptr()), + out.numel()); + break; + } + case at::ScalarType::Half: { + fa->allreduce(stream, reinterpret_cast(reg_buffer), + reinterpret_cast(out.data_ptr()), out.numel()); + break; + } +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + case at::ScalarType::BFloat16: { + fa->allreduce( + stream, reinterpret_cast(reg_buffer), + reinterpret_cast(out.data_ptr()), out.numel()); + break; + } +#endif + default: + throw std::runtime_error( + "custom allreduce only supports float32, float16 and bfloat16"); + } +} + +void dispose(fptr_t _fa) { + delete reinterpret_cast(_fa); +} + +int64_t meta_size() { return sizeof(vllm::Signal); } + +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs) { + auto fa = reinterpret_cast(_fa); + TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_); + void* ipc_ptrs[8]; + for (int i = 0; i < fake_ipc_ptrs.size(); i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + fa->register_buffer(ipc_ptrs); +} + +// Use vector to represent byte data for python binding compatibility. +std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa) { + auto fa = reinterpret_cast(_fa); + auto [handle, offsets] = fa->get_graph_buffer_ipc_meta(); + std::vector bytes(handle.begin(), handle.end()); + return std::make_tuple(bytes, offsets); +} + +// Use vector to represent byte data for python binding compatibility. 
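+//
+// Rough CUDA-graph flow tying these two ops together (step numbering follows
+// the CustomAllreduce class comment in custom_all_reduce.cuh; names such as
+// all_bytes/all_offsets are illustrative and the all-gather itself happens on
+// the Python side):
+//   1. capture the graph; all_reduce() records unregistered input pointers internally
+//   2. auto [bytes, offsets] = get_graph_buffer_ipc_meta(fa);
+//   3. all-gather bytes/offsets across ranks
+//   4. register_graph_buffers(fa, all_bytes, all_offsets);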
+void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, + const std::vector>& offsets) { + auto fa = reinterpret_cast(_fa); + std::vector bytes; + bytes.reserve(handles.size()); + for (int i = 0; i < handles.size(); i++) { + bytes.emplace_back(handles[i].begin(), handles[i].end()); + } + bytes.reserve(handles.size()); + fa->register_graph_buffers(bytes, offsets); +} + +std::tuple allocate_shared_buffer_and_handle( + int64_t size) { + auto device_index = c10::cuda::current_device(); + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + void* buffer; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Allocate buffer +#if defined(USE_ROCM) + // data buffers need to be "uncached" for signal on MI200 + AT_CUDA_CHECK( + hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached)); +#else + AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size)); +#endif + AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream)); + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Create IPC memhandle for the allocated buffer. + // Will use it in open_mem_handle. + auto options = + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); + auto handle = + torch::empty({static_cast(sizeof(cudaIpcMemHandle_t))}, options); + AT_CUDA_CHECK( + cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer)); + + return std::make_tuple(reinterpret_cast(buffer), handle); +} + +fptr_t open_mem_handle(torch::Tensor& mem_handle) { + void* ipc_ptr; + AT_CUDA_CHECK(cudaIpcOpenMemHandle( + (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()), + cudaIpcMemLazyEnablePeerAccess)); + return reinterpret_cast(ipc_ptr); +} + +void free_shared_buffer(fptr_t buffer) { + AT_CUDA_CHECK(cudaFree(reinterpret_cast(buffer))); +} diff --git a/vllm_v0.10.0/csrc/custom_all_reduce.cuh b/vllm_v0.10.0/csrc/custom_all_reduce.cuh new file mode 100644 index 0000000..44709b4 --- /dev/null +++ b/vllm_v0.10.0/csrc/custom_all_reduce.cuh @@ -0,0 +1,605 @@ +#pragma once + +#include +#include +#include +#include + +#if defined(USE_ROCM) +typedef __hip_bfloat16 nv_bfloat16; +#endif + +#include +#include +#include +#include +#include +#include + +namespace vllm { +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Maximal number of blocks in allreduce kernel. +constexpr int kMaxBlocks = 36; + +// Default number of blocks in allreduce kernel. +#ifndef USE_ROCM +const int defaultBlockLimit = 36; +CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR; +#else +const int defaultBlockLimit = 16; +hipPointer_attribute rangeStartAddrAttr = + HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR; +#endif + +// Counter may overflow, but it's fine since unsigned int overflow is +// well-defined behavior. +using FlagType = uint32_t; + +// Two sets of peer counters are needed for two syncs: starting and ending an +// operation. The reason is that it's possible for peer GPU block to arrive at +// the second sync point while the current GPU block haven't passed the first +// sync point. Thus, peer GPU may write counter+1 while current GPU is busy +// waiting for counter. 
We use alternating counter array to avoid this +// possibility. +struct Signal { + alignas(128) FlagType start[kMaxBlocks][8]; + alignas(128) FlagType end[kMaxBlocks][8]; + alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank +}; + +struct __align__(16) RankData { + const void* ptrs[8]; +}; + +struct __align__(16) RankSignals { + Signal* signals[8]; +}; + +// like std::array, but aligned +template +struct __align__(alignof(T) * sz) array_t { + T data[sz]; + using type = T; + static constexpr int size = sz; +}; + +// use packed type to maximize memory efficiency +// goal: generate ld.128 and st.128 instructions +template +struct packed_t { + // the (P)acked type for load/store + using P = array_t; + // the (A)ccumulator type for reduction + using A = array_t; +}; + +#define DINLINE __device__ __forceinline__ + +// scalar cast functions +DINLINE float upcast_s(half val) { return __half2float(val); } + +template +DINLINE T downcast_s(float val); +template <> +DINLINE half downcast_s(float val) { + return __float2half(val); +} + +// scalar add functions +// for some reason when compiling with Pytorch, the + operator for half and +// bfloat is disabled so we call the intrinsics directly +DINLINE half& assign_add(half& a, half b) { + a = __hadd(a, b); + return a; +} +DINLINE float& assign_add(float& a, float b) { return a += b; } + +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); } +template <> +DINLINE nv_bfloat16 downcast_s(float val) { + return __float2bfloat16(val); +} +DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) { + a = __hadd(a, b); + return a; +} +#endif + +template +DINLINE array_t& packed_assign_add(array_t& a, array_t b) { +#pragma unroll + for (int i = 0; i < N; i++) { + assign_add(a.data[i], b.data[i]); + } + return a; +} + +template +DINLINE array_t upcast(array_t val) { + if constexpr (std::is_same::value) { + return val; + } else { + array_t out; +#pragma unroll + for (int i = 0; i < N; i++) { + out.data[i] = upcast_s(val.data[i]); + } + return out; + } +} + +template +DINLINE O downcast(array_t val) { + if constexpr (std::is_same::value) { + return val; + } else { + O out; +#pragma unroll + for (int i = 0; i < O::size; i++) { + out.data[i] = downcast_s(val.data[i]); + } + return out; + } +} + +#if !defined(USE_ROCM) + +static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); + #else + asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); + #endif +} + +static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { + FlagType flag; + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("ld.acquire.sys.global.u32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); + #else + asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" + : "=r"(flag) + : "l"(flag_addr)); + #endif + return flag; +} + +static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) { + asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +} + +static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) { + FlagType flag; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); + return flag; +} + +// This function is meant to be used as the first synchronization in the all +// reduce kernel. 
Thus, it doesn't need to make any visibility guarantees for +// prior memory accesses. Note: volatile writes will not be reordered against +// other volatile writes. +template +DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, + int rank) { + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank]; + auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x]; + // Write the expected counter value to peer and wait for correct value + // from peer. + st_flag_volatile(peer_counter_ptr, flag); + while (ld_flag_volatile(self_counter_ptr) != flag); + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +// This function is meant to be used as the second or the final +// synchronization barrier in the all reduce kernel. If it's the final +// synchronization barrier, we don't need to make any visibility guarantees +// for prior memory accesses. +template +DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { + __syncthreads(); + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank]; + auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x]; + // Write the expected counter value to peer and wait for correct value from + // peer. + if constexpr (!final_sync) { + st_flag_release(peer_counter_ptr, flag); + while (ld_flag_acquire(self_counter_ptr) != flag); + } else { + st_flag_volatile(peer_counter_ptr, flag); + while (ld_flag_volatile(self_counter_ptr) != flag); + } + } + if constexpr (!final_sync) __syncthreads(); + + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +#else + +template +DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, + int rank) { + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], + flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x], + __ATOMIC_RELAXED, + __MEMORY_SCOPE_DEVICE) < flag); + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +template +DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { + __syncthreads(); + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], + flag, + final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE, + __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while ( + __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x], + final_sync ? 
__ATOMIC_RELAXED : __ATOMIC_ACQUIRE, + __MEMORY_SCOPE_DEVICE) < flag); + } + if constexpr (!final_sync) __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +#endif + +template +DINLINE P packed_reduce(const P* ptrs[], int idx) { + A tmp = upcast(ptrs[0][idx]); +#pragma unroll + for (int i = 1; i < ngpus; i++) { + packed_assign_add(tmp, upcast(ptrs[i][idx])); + } + return downcast
<P>
(tmp); +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg, + T* __restrict__ result, int rank, int size) { + using P = typename packed_t::P; + using A = typename packed_t::A; + // note: we don't reorder the address so the accumulation order is the same + // for all ranks, ensuring bitwise identical results + auto dp = *_dp; + barrier_at_start(sg, self_sg, rank); + // do the actual reduction + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + ((P*)result)[idx] = packed_reduce((const P**)&dp.ptrs[0], idx); + } + barrier_at_end(sg, self_sg, rank); +} + +template +DINLINE P* get_tmp_buf(Signal* sg) { + return (P*)(((Signal*)sg) + 1); +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg, + T* __restrict__ result, int rank, int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename packed_t::P; + using A = typename packed_t::A; + int part = size / ngpus; + int start = rank * part; + int end = rank == ngpus - 1 ? size : start + part; + int largest_part = part + size % ngpus; + const P* ptrs[ngpus]; + P* tmps[ngpus]; +#pragma unroll + for (int i = 0; i < ngpus; i++) { + int target = (rank + i) % ngpus; + ptrs[i] = (const P*)_dp->ptrs[target]; + tmps[i] = get_tmp_buf
<P>
(sg.signals[target]); + } + auto tmp_out = tmps[0]; + barrier_at_start(sg, self_sg, rank); + + // stage 1: reduce scatter + for (int idx = start + tid; idx < end; idx += stride) { + tmp_out[idx - start] = packed_reduce(ptrs, idx); + } + barrier_at_end(sg, self_sg, rank); + + // stage 2: allgather. Note: it's important to match the tid between + // the two stages, because visibility across devices is only guaranteed + // between threads that have the same tid. If thread i computes the sum of + // start + i in the first stage, then thread i also gathers start + i from + // all ranks. + + for (int idx = tid; idx < largest_part; idx += stride) { +#pragma unroll + for (int i = 0; i < ngpus; i++) { + int gather_from_rank = ((rank + i) % ngpus); + if (gather_from_rank == ngpus - 1 || idx < part) { + int dst_idx = gather_from_rank * part + idx; + ((P*)result)[dst_idx] = tmps[i][idx]; + } + } + } +} + +using IPC_KEY = std::array; +static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t)); +static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t)); + +class CustomAllreduce { + public: + int rank_; + int world_size_; + // Full NVLink or xGMI connection between GPUs. + bool fully_connected_; + + RankSignals sg_; + // Stores a map from a pointer to its peer pointers from all ranks. + std::unordered_map buffers_; + Signal* self_sg_; + + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph + // capture time. However, the peer pointers are not known during graph + // capture time. Therefore, during capture, we increment the rank data + // pointer and use that as the argument to the kernel. The kernel arguments + // are stored in graph_unreg_buffers_. The actual peer pointers will be + // filled in at the memory pointed to by the pointers in + // graph_unreg_buffers_ when the IPC handles are exchanged between ranks. + // + // The overall process looks like this: + // 1. Graph capture. + // 2. Each rank obtains the IPC handles for each addresses used during cuda + // graph capture using get_graph_buffer_ipc_meta. + // 3. (In Python) all gather the IPC handles. + // 4. Obtain the peer pointers by opening the IPC handles, and store them in + // the rank data array at corresponding positions. + RankData *d_rank_data_base_, *d_rank_data_end_; + std::vector graph_unreg_buffers_; + // a map from IPC handles to opened IPC pointers + std::map ipc_handles_; + + /** + * Signals are an array of ipc-enabled buffers from all ranks. + * For each of the buffer, the layout is as follows: + * | -- sizeof(Signal) -- | ------ a few MB ----- | + * The first section is for allreduce synchronization, and the second + * section is for storing the intermediate results required by some + * allreduce algos. + * + * Note: this class does not own any device memory. Any required buffers + * are passed in from the constructor. 
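 + *
 + * (For a concrete sizing example, the standalone test in
 + * custom_all_reduce_test.cu below allocates sizeof(vllm::Signal) +
 + * 2 * data_size * sizeof(T) per rank: the Signal header, a data_size-sized
 + * scratch region for the 2-stage kernel, and a data_size-sized region
 + * holding that rank's input.)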
+ */ + CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, + int rank, int world_size, bool fully_connected = true) + : rank_(rank), + world_size_(world_size), + fully_connected_(fully_connected), + self_sg_(signals[rank]), + d_rank_data_base_(reinterpret_cast(rank_data)), + d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { + for (int i = 0; i < world_size_; i++) { + sg_.signals[i] = signals[i]; + } + } + + char* open_ipc_handle(const void* ipc_handle) { + auto [it, new_handle] = + ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr}); + if (new_handle) { + char* ipc_ptr; + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr, + *((const cudaIpcMemHandle_t*)ipc_handle), + cudaIpcMemLazyEnablePeerAccess)); + it->second = ipc_ptr; + } + return it->second; + } + + std::pair> get_graph_buffer_ipc_meta() { + auto num_buffers = graph_unreg_buffers_.size(); + auto handle_sz = sizeof(cudaIpcMemHandle_t); + std::string handles(handle_sz * num_buffers, static_cast(0)); + std::vector offsets(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto ptr = graph_unreg_buffers_[i]; + void* base_ptr; + // note: must share the base address of each allocation, or we get wrong + // address + if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr, + (CUdeviceptr)ptr) != CUDA_SUCCESS) + throw std::runtime_error("failed to get pointer attr"); + CUDACHECK(cudaIpcGetMemHandle( + (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr)); + offsets[i] = ((char*)ptr) - ((char*)base_ptr); + } + return std::make_pair(handles, offsets); + } + + void check_rank_data_capacity(size_t num = 1) { + if (d_rank_data_base_ + num > d_rank_data_end_) + throw std::runtime_error( + "Rank data buffer is overflowed by " + + std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); + } + + /** + * Register already-shared IPC pointers. + */ + void register_buffer(void** ptrs) { + check_rank_data_capacity(); + RankData data; + for (int i = 0; i < world_size_; i++) { + data.ptrs[i] = ptrs[i]; + } + auto d_data = d_rank_data_base_++; + CUDACHECK( + cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); + buffers_[ptrs[rank_]] = d_data; + } + + // Note: when registering graph buffers, we intentionally choose to not + // deduplicate the addresses. That means if the allocator reuses some + // addresses, they will be registered again. This is to account for the + // remote possibility of different allocation patterns between ranks. For + // example, rank 1 may get the same input address for the second allreduce, + // but rank 2 got a different address. IPC handles have internal reference + // counting mechanism so overhead should be small. + void register_graph_buffers( + const std::vector& handles, + const std::vector>& offsets) { + auto num_buffers = graph_unreg_buffers_.size(); + check_rank_data_capacity(num_buffers); + std::vector rank_data(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto self_ptr = graph_unreg_buffers_[i]; + auto& rd = rank_data[i]; + for (int j = 0; j < world_size_; j++) { + if (j != rank_) { + char* handle = + open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); + handle += offsets[j][i]; + rd.ptrs[j] = handle; + } else { + rd.ptrs[j] = self_ptr; + } + } + } + CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(), + sizeof(RankData) * num_buffers, + cudaMemcpyHostToDevice)); + d_rank_data_base_ += num_buffers; + graph_unreg_buffers_.clear(); + } + + /** + * Performs allreduce, assuming input has already been registered. 
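 + *
 + * Kernel selection (see the switch below): world_size_ == 2 always uses
 + * cross_device_reduce_1stage; with full NVLink/xGMI connectivity, small
 + * inputs (< 512 KB for up to 4 GPUs, < 256 KB for up to 8 GPUs) also use the
 + * 1-stage kernel, while larger inputs use the 2-stage reduce-scatter +
 + * all-gather kernel. E.g. an 8-GPU, 1 MB fp16 allreduce dispatches to
 + * cross_device_reduce_2stage.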
+ * + * Block and grid default configs are results after careful grid search. + * Using 36 blocks give the best or close to the best runtime on the devices + * I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also + * only take a small amount of SMs. Not quite sure the underlying reason, + * but my guess is that too many SMs will cause contention on NVLink bus. + */ + template + void allreduce(cudaStream_t stream, T* input, T* output, int size, + int threads = 512, int block_limit = defaultBlockLimit) { + auto d = packed_t::P::size; + if (size % d != 0) + throw std::runtime_error( + "custom allreduce currently requires input length to be multiple " + "of " + + std::to_string(d)); + if (block_limit > kMaxBlocks) + throw std::runtime_error("max supported block limit is " + + std::to_string(kMaxBlocks) + ". Got " + + std::to_string(block_limit)); + + RankData* ptrs; + cudaStreamCaptureStatus status; + CUDACHECK(cudaStreamIsCapturing(stream, &status)); + if (status == cudaStreamCaptureStatusActive) { + ptrs = d_rank_data_base_ + graph_unreg_buffers_.size(); + graph_unreg_buffers_.push_back(input); + } else { + auto it = buffers_.find(input); + if (it == buffers_.end()) + throw std::runtime_error( + "buffer address " + + std::to_string(reinterpret_cast(input)) + + " is not registered!"); + ptrs = it->second; + } + + size /= d; + auto bytes = size * sizeof(typename packed_t::P); + int blocks = std::min(block_limit, (size + threads - 1) / threads); +#define KL(ngpus, name) \ + name<<>>(ptrs, sg_, self_sg_, output, \ + rank_, size); +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (world_size_ == 2) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (fully_connected_) { \ + if ((world_size_ <= 4 && bytes < 512 * 1024) || \ + (world_size_ <= 8 && bytes < 256 * 1024)) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else { \ + KL(ngpus, cross_device_reduce_2stage); \ + } \ + } \ + break; \ + } + + switch (world_size_) { + REDUCE_CASE(2) + REDUCE_CASE(4) + REDUCE_CASE(6) + REDUCE_CASE(8) + default: + throw std::runtime_error( + "custom allreduce only supports num gpus in (2,4,6,8). Actual " + "num " + "gpus = " + + std::to_string(world_size_)); + } +#undef REDUCE_CASE +#undef KL + } + + ~CustomAllreduce() { + for (auto [_, ptr] : ipc_handles_) { + CUDACHECK(cudaIpcCloseMemHandle(ptr)); + } + } +}; + +/** + * To inspect PTX/SASS, copy paste this header file to compiler explorer and + add a template instantiation: + * template void vllm::CustomAllreduce::allreduce(cudaStream_t, half *, + half *, int, int, int); +*/ +} // namespace vllm \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/custom_all_reduce_test.cu b/vllm_v0.10.0/csrc/custom_all_reduce_test.cu new file mode 100644 index 0000000..f7f0823 --- /dev/null +++ b/vllm_v0.10.0/csrc/custom_all_reduce_test.cu @@ -0,0 +1,361 @@ +/** + * This is a standalone test for custom allreduce. + * To compile, make sure you have MPI and NCCL installed in your system. + * export MPI_HOME=XXX + * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o + * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi + * + * Warning: this C++ test is not designed to be very readable and was used + * during the rapid prototyping process. 
+ * + * To run: + * mpirun --allow-run-as-root -np 8 ./custom_all_reduce_test + */ +#include +#include +#include +#include + +#include +#include + +#include "cuda_profiler_api.h" +#include "custom_all_reduce.cuh" +#include "mpi.h" +#ifdef USE_ROCM + #include +typedef __hip_bfloat16 nv_bfloat16; + #include "rccl/rccl.h" + #include "custom_all_reduce_hip.cuh" +#else + #include "nccl.h" + #include "custom_all_reduce.cuh" +#endif + +#define MPICHECK(cmd) \ + do { \ + int e = cmd; \ + if (e != MPI_SUCCESS) { \ + printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#ifdef USE_ROCM +__global__ void dummy_kernel() { + for (int i = 0; i < 100; i++) { + uint64_t start = wall_clock64(); + uint64_t cycles_elapsed; + do { + cycles_elapsed = wall_clock64() - start; + } while (cycles_elapsed < 100); + } + for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms +} +#else +__global__ void dummy_kernel() { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms + #else + for (int i = 0; i < 100; i++) { + long long int start = clock64(); + while (clock64() - start < 150000000); // approximately 98.4ms on P40 + } + #endif +} +#endif + +template +__global__ void set_data(T* data, int size, int myRank) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + data[idx] = myRank * 0.11f; + } +} + +template +__global__ void convert_data(const T* data1, const T* data2, double* fdata1, + double* fdata2, int size) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + fdata1[idx] = data1[idx]; + fdata2[idx] = data2[idx]; + } +} + +__global__ void init_rand(curandState_t* state, int size, int nRanks) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + for (int i = 0; i < nRanks; i++) { + curand_init(i + 1, idx, 0, &state[idx * nRanks + i]); + } + } +} + +template +__global__ void gen_data(curandState_t* state, T* data, double* ground_truth, + int myRank, int nRanks, int size) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + double sum = 0.0; + for (int i = 0; i < nRanks; i++) { + double val = curand_uniform_double(&state[idx * nRanks + i]) * 4; + T hval = val; // downcast first + sum += static_cast(hval); + if (i == myRank) data[idx] = hval; + } + ground_truth[idx] = sum; + } +} + +template +void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, + int data_size, bool performance_test) { + T* result; + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUDACHECK(cudaMalloc(&result, data_size * sizeof(T))); + CUDACHECK(cudaMemset(result, 0, data_size * sizeof(T))); + + cudaIpcMemHandle_t self_data_handle; + cudaIpcMemHandle_t data_handles[8]; + vllm::Signal* buffer; + T* self_data_copy; + /** + * Allocate IPC buffer + * + * The first section is a temporary buffer for storing intermediate allreduce + * results, if a particular algorithm requires it. The second section is for + * the input to the allreduce. 
The actual API takes the input pointer as an + * argument (that is, they can and usually should be allocated separately). + * But since the input pointers and the temporary buffer all require IPC + * registration, they are allocated and registered together in the test for + * convenience. + */ +#ifdef USE_ROCM + CUDACHECK(hipExtMallocWithFlags( + (void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal), + hipDeviceMallocUncached)); +#else + CUDACHECK( + cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); +#endif + CUDACHECK( + cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); + CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T))); + CUDACHECK(cudaIpcGetMemHandle(&self_data_handle, buffer)); + + MPICHECK(MPI_Allgather(&self_data_handle, sizeof(cudaIpcMemHandle_t), + MPI_BYTE, data_handles, sizeof(cudaIpcMemHandle_t), + MPI_BYTE, MPI_COMM_WORLD)); + + void* rank_data; + size_t rank_data_sz = 16 * 1024 * 1024; + CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); + vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < nRanks; i++) { + if (i == myRank) + ipc_ptrs[i] = buffer; + else + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i], + cudaIpcMemLazyEnablePeerAccess)); + } + vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks); + auto* self_data = + reinterpret_cast(reinterpret_cast(buffer) + + sizeof(vllm::Signal) + data_size * sizeof(T)); + // hack buffer registration + { + void* data[8]; + for (int i = 0; i < nRanks; i++) { + data[i] = + ((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T); + } + fa.register_buffer(data); + } + + double* ground_truth; + CUDACHECK(cudaMallocHost(&ground_truth, data_size * sizeof(double))); + curandState_t* states; + CUDACHECK(cudaMalloc(&states, sizeof(curandState_t) * nRanks * data_size)); + init_rand<<<108, 1024, 0, stream>>>(states, data_size, nRanks); + gen_data<<<108, 1024, 0, stream>>>(states, self_data, ground_truth, myRank, + nRanks, data_size); + CUDACHECK(cudaMemcpyAsync(self_data_copy, self_data, data_size * sizeof(T), + cudaMemcpyDeviceToDevice, stream)); + cudaEvent_t start, stop; + CUDACHECK(cudaEventCreate(&start)); + CUDACHECK(cudaEventCreate(&stop)); + + ncclDataType_t ncclDtype; + if (std::is_same::value) { + ncclDtype = ncclFloat16; + } else if (std::is_same::value) { + ncclDtype = ncclBfloat16; + } else { + ncclDtype = ncclFloat; + } + double *nccl_result, *my_result; + CUDACHECK(cudaMallocHost(&nccl_result, data_size * sizeof(double))); + CUDACHECK(cudaMallocHost(&my_result, data_size * sizeof(double))); + if (performance_test) { + dummy_kernel<<<1, 1, 0, stream>>>(); + constexpr int warmup_iters = 5; + constexpr int num_iters = 100; + // warmup + for (int i = 0; i < warmup_iters; i++) { + NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, + comm, stream)); + } + CUDACHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < num_iters; i++) { + NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, + comm, stream)); + } + CUDACHECK(cudaEventRecord(stop, stream)); + CUDACHECK(cudaStreamSynchronize(stream)); + float allreduce_ms = 0; + cudaEventElapsedTime(&allreduce_ms, start, stop); + + dummy_kernel<<<1, 1, 0, stream>>>(); + // warm up + for (int i = 0; i < warmup_iters; i++) { + fa.allreduce(stream, self_data, result, data_size, threads, + block_limit); + } + CUDACHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < num_iters; i++) { + fa.allreduce(stream, self_data, result, 
data_size, threads, + block_limit); + } + CUDACHECK(cudaEventRecord(stop, stream)); + CUDACHECK(cudaStreamSynchronize(stream)); + + float duration_ms = 0; + cudaEventElapsedTime(&duration_ms, start, stop); + if (myRank == 0) + printf( + "Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl " + "time:%.2fus\n", + myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit, + duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters); + + // And wait for all the queued up work to complete + CUDACHECK(cudaStreamSynchronize(stream)); + + NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype, + ncclSum, comm, stream)); + + convert_data<<<108, 1024, 0, stream>>>(self_data, result, nccl_result, + my_result, data_size); + CUDACHECK(cudaStreamSynchronize(stream)); + + for (unsigned long j = 0; j < data_size; j++) { + auto diff = abs(nccl_result[j] - my_result[j]); + if (diff >= 4e-2) { + printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n", + myRank, j, nccl_result[j], my_result[j], ground_truth[j]); + break; + } + } + long double nccl_diffs = 0.0; + long double my_diffs = 0.0; + for (int j = 0; j < data_size; j++) { + nccl_diffs += abs(nccl_result[j] - ground_truth[j]); + my_diffs += abs(my_result[j] - ground_truth[j]); + } + if (myRank == 0) + std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size + << " me: " << my_diffs / data_size << std::endl; + } else { + for (int i = 0; i < 100; i++) { + fa.allreduce(stream, self_data, result, data_size, threads, + block_limit); + CUDACHECK(cudaStreamSynchronize(stream)); + NCCLCHECK(ncclAllReduce(self_data, self_data_copy, data_size, ncclDtype, + ncclSum, comm, stream)); + convert_data<<<108, 1024, 0, stream>>>( + self_data_copy, result, nccl_result, my_result, data_size); + CUDACHECK(cudaStreamSynchronize(stream)); + + for (unsigned long j = 0; j < data_size; j++) { + auto diff = abs(nccl_result[j] - my_result[j]); + if (diff >= 4e-2) { + printf( + "Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n", + myRank, j, nccl_result[j], my_result[j], ground_truth[j]); + break; + } + } + } + if (myRank == 0) + printf("Test passed: nGPUs:%d, sz (kb): %d, %d, %d\n", nRanks, + data_size * sizeof(T) / 1024, threads, block_limit); + // long double nccl_diffs = 0.0; + // long double my_diffs = 0.0; + // for (int j = 0; j < data_size; j++) { + // nccl_diffs += abs(nccl_result[j] - ground_truth[j]); + // my_diffs += abs(my_result[j] - ground_truth[j]); + // } + // if (myRank == 0) + // std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size + // << " me: " << my_diffs / data_size << std::endl; + } + + CUDACHECK(cudaFree(result)); + CUDACHECK(cudaFree(self_data_copy)); + CUDACHECK(cudaFree(rank_data)); + CUDACHECK(cudaFree(buffer)); + CUDACHECK(cudaFree(states)); + CUDACHECK(cudaFreeHost(ground_truth)); + CUDACHECK(cudaFreeHost(nccl_result)); + CUDACHECK(cudaFreeHost(my_result)); + CUDACHECK(cudaStreamDestroy(stream)); +} + +int main(int argc, char** argv) { + int nRanks, myRank; + MPICHECK(MPI_Init(&argc, &argv)); + MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); + MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); + CUDACHECK(cudaSetDevice(myRank)); + ncclUniqueId id; + ncclComm_t comm; + if (myRank == 0) ncclGetUniqueId(&id); + MPICHECK(MPI_Bcast(static_cast(&id), sizeof(id), MPI_BYTE, 0, + MPI_COMM_WORLD)); + NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); + + bool performance_test = true; + cudaProfilerStart(); +// Uncomment to scan through different block size 
configs. +// for (int threads : {256, 512, 1024}) { +// for (int block_limit = 16; block_limit < 112; block_limit += 4) { +// run(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, +// performance_test); +// } +// } +#ifdef USE_ROCM + const int block_limit = 16; +#else + const int block_limit = 36; +#endif + // Scan through different sizes to test performance. + for (int sz = 512; sz <= (8 << 20); sz *= 2) { + run(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test); + } + + cudaProfilerStop(); + MPICHECK(MPI_Finalize()); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/custom_quickreduce.cu b/vllm_v0.10.0/csrc/custom_quickreduce.cu new file mode 100644 index 0000000..33d0d4a --- /dev/null +++ b/vllm_v0.10.0/csrc/custom_quickreduce.cu @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#ifdef USE_ROCM + + #include "quickreduce/quick_reduce.h" + +quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size, + std::optional qr_max_size) { + if (world_size > 8) + throw std::invalid_argument("world size > 8 is not supported"); + if (world_size == 6) + throw std::invalid_argument("world size == 6 is not supported"); + if (world_size % 2 != 0) + throw std::invalid_argument("Odd num gpus is not supported for now"); + if (rank < 0 || rank >= world_size) + throw std::invalid_argument("invalid rank passed in"); + quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms(); + fptr->init(world_size, rank, qr_max_size); + return (quickreduce::fptr_t)fptr; +} + +void qr_destroy(quickreduce::fptr_t _fa) { + if (_fa) { + auto fa = reinterpret_cast(_fa); + fa->destroy(); + delete fa; + } +} + +torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) { + auto fa = reinterpret_cast(_fa); + hipIpcMemHandle_t handle = fa->get_handle(); + auto options = + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); + auto data_handle = + torch::empty({static_cast(sizeof(hipIpcMemHandle_t))}, options); + std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t)); + return data_handle; +} + +void qr_open_handles(quickreduce::fptr_t _fa, + const std::vector& handles) { + auto fa = reinterpret_cast(_fa); + std::vector ipc_handles; + ipc_handles.reserve(handles.size()); + for (auto& handle : handles) { + // Ensure the tensor is on the same device as the current device. 
+ hipIpcMemHandle_t ipc_handle; + std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t)); + ipc_handles.push_back(ipc_handle); + } + fa->open_ipc_handles(ipc_handles); +} + +void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp, + torch::Tensor& out, int64_t quant_level, bool cast_bf2half) { + auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); + TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize); + if (out.scalar_type() == at::ScalarType::Half) { + fa->allreduce(reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel(), quant_level, stream); + } else if (out.scalar_type() == at::ScalarType::BFloat16) { + if (cast_bf2half) { + fa->allreduce(reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel(), quant_level, stream); + } else { + fa->allreduce( + reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel(), quant_level, stream); + } + } else { + throw std::runtime_error( + "quick allreduce only supports float16 and bfloat16"); + } +} + +int64_t qr_max_size() { + // The default is 2GB (2,147,483,648 bytes) + return static_cast(std::numeric_limits::max()) + 1; +} + + #define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half) \ + template struct quickreduce::AllReduceTwoshot, \ + cast_bf2half>; \ + template struct quickreduce::AllReduceTwoshot, \ + cast_bf2half>; \ + template struct quickreduce::AllReduceTwoshot, cast_bf2half>; + +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true) + +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false) +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false) +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false) +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false) + +#endif // USE_ROCM \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/common.cpp b/vllm_v0.10.0/csrc/cutlass_extensions/common.cpp new file mode 100644 index 0000000..3d2093a --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/common.cpp @@ -0,0 +1,11 @@ +#include "cutlass_extensions/common.hpp" + +int32_t get_sm_version_num() { + int32_t major_capability, minor_capability; + cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, + 0); + cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, + 0); + int32_t version_num = major_capability * 10 + minor_capability; + return version_num; +} \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/common.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/common.hpp new file mode 100644 index 0000000..195872e --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/common.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include "cutlass/cutlass.h" 
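+
+// Usage sketch for the helpers below (illustrative only; gemm_op, args and
+// workspace stand in for a CUTLASS 3.x device-level operator and its
+// arguments, whose initialize()/run() return cutlass::Status):
+//
+//   int32_t sm = get_sm_version_num();   // e.g. 90 on an SM90 (Hopper) device
+//   CUTLASS_CHECK(gemm_op.initialize(args, workspace, stream));
+//   CUTLASS_CHECK(gemm_op.run(stream));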
+#include +#include "cuda_runtime.h" +#include + +/** + * Helper function for checking CUTLASS errors + */ +#define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + TORCH_CHECK(error == cutlass::Status::kSuccess, \ + cutlassGetStatusString(error)); \ + } + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + return max_shared_mem_per_block_opt_in; +} + +int32_t get_sm_version_num(); + +/** + * A wrapper for a kernel that is used to guard against compilation on + * architectures that will never use the kernel. The purpose of this is to + * reduce the size of the compiled binary. + * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef + * into code that will be executed on the device where it is defined. + */ +template +struct enable_sm90_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; + +template +struct enable_sm90_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; + +template +struct enable_sm100_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/cute_utils.cuh b/vllm_v0.10.0/csrc/cutlass_extensions/cute_utils.cuh new file mode 100644 index 0000000..f61fe3c --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/cute_utils.cuh @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +namespace cute { + +//////////////////////////////////////////////////////////////////// +// layout utils +//////////////////////////////////////////////////////////////////// + +// Permute layout based on indices, example: +// permute_layout<1, 0>(layout) will swap the two dimensions +// permute_layout<0, 2, 1>(layout) will swap the last two dimensions +template +CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) { + static_assert(rank(l) == sizeof...(I), "Invalid permutation, rank mismatch"); + return cute::make_layout(cute::get(l)...); +} + +// is the layout f(x) = x +template +CUTE_HOST_DEVICE static constexpr bool is_identity_layout() { + if constexpr (std::is_same_v) { + return true; + } else { + constexpr auto coalesced_layout = coalesce(Layout{}); + if constexpr (rank(coalesced_layout) == 1 && + stride<0>(coalesced_layout) == 1) { + return true; + } + return false; + } +} + +//////////////////////////////////////////////////////////////////// +// Pointer utils +//////////////////////////////////////////////////////////////////// + +template +static constexpr auto get_logical_ptr(PointerType* ptr) { + if constexpr (cute::sizeof_bits_v < 8) { + return cute::subbyte_iterator(ptr); + } else { + return ptr; + } +} + +//////////////////////////////////////////////////////////////////// +// Misc utils +//////////////////////////////////////////////////////////////////// + +template +CUTE_HOST_DEVICE static constexpr auto create_auto_vectorizing_copy() { + constexpr auto bits = sizeof_bits_v * Elements{}; + if constexpr (bits % 128 == 0) { + return AutoVectorizingCopyWithAssumedAlignment<128>{}; + } else if 
constexpr (bits % 64 == 0) { + return AutoVectorizingCopyWithAssumedAlignment<64>{}; + } else if constexpr (bits % 32 == 0) { + return AutoVectorizingCopyWithAssumedAlignment<32>{}; + } else if constexpr (bits % 16 == 0) { + return AutoVectorizingCopyWithAssumedAlignment<16>{}; + } else { + return AutoVectorizingCopyWithAssumedAlignment<8>{}; + } +} + +}; // namespace cute diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp new file mode 100644 index 0000000..5c1d6e3 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp @@ -0,0 +1,457 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +// from https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either row/column or scalar broadcasting +// where the tensor being loaded from is always passed in via a device pointer. +// This lets one compiled kernel handle all cases of per-tensor or +// per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graphs +// breaks when moving scales to the CPU. 
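+//
+// Illustration (hypothetical values) of how the modified Arguments are meant
+// to be filled in, e.g. for the row (per-channel scale) broadcast defined
+// below:
+//   per-channel scales: { .ptr_row_array = scale_ptrs,  .row_broadcast = true,  .dRow = dRow }
+//   per-tensor scale:   { .ptr_row_array = scalar_ptrs, .row_broadcast = false, .dRow = dRow }
+// i.e. the scalar case still passes device pointers (each to a single
+// element) and merely disables the broadcast, so one compiled kernel covers
+// both cases.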
+// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcastArray { + static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. + struct Arguments { + const Element* const* ptr_row_array = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) + , smem(const_cast(shared_storage.smem.data())) { } + + Params params; + Element *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row_array[group]) == Element(0)); + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, + int group, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , tCcRow(tCcRow_) + , residue_tCcRow(residue_tCcRow_) + , group(group) + , params(params_) {} + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcRow; // 
(m, n) + ThrNum thr_num; + int group; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (!params.row_broadcast) { + fill(tSR_rRow, *(params.ptr_row_array[group])); + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. + } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + if (!params.row_broadcast) return; // Do not issue LDS when row is scalar + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); + copy(tSR_sRow_flt, tSR_rRow_flt); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row_array[l]), make_shape(M,N,1), params.dRow); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); + Tensor tGS_cRow = thr_g2s.partition_S(cRow); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.tCcD, + args.residue_cD, + ThreadCount{}, + l, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcastArray { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); 
+ static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + (cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. + struct Arguments { + const Element* const* ptr_col_array = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col_array[group]) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + int group, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + group(group), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params const& params; + int m; + int group; + + CUTLASS_DEVICE void + begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col_array[group])); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_if(pred, filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col_array[l]), make_shape(M,N,1), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + // Generate an identity tensor matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + l, + params + ); + } +}; + +} diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp new file mode 100644 index 0000000..7aa87fe --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp @@ -0,0 +1,497 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/visitor_load.hpp from +// https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either +// row/column or scalar broadcasting where the tensor being loaded from is +// always passed in via a device pointer. This lets one compiled kernel handle +// all cases of per-tensor or per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graph +// breaks when moving scales to the CPU. +// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp" +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" +#include "cute/tensor.hpp" + +namespace cutlass::epilogue::threadblock { + +using namespace cute; +using namespace detail; + +template< + class ThreadMap, + class Element, + class StrideMNL +> +struct VisitorRowOrScalarBroadcast { + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast. + struct Arguments { + Element const* ptr_row = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + struct SharedStorage {}; + + // Global load type + static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits::value; + using VecType = uint_bit_t; + static int constexpr VecLength = sizeof(VecType) / sizeof(Element); + + CUTLASS_HOST_DEVICE + VisitorRowOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + VisitorRowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + template + struct Callbacks : EmptyCallbacks { + CUTLASS_DEVICE + Callbacks( + GTensor&& tC_gRow, + RTensor&& tC_rRow, + CTensor&& tC_cRow, + ProblemShape problem_shape, + Params const* params_ptr + ): + tC_gRow(cute::forward(tC_gRow)), + tC_rRow(cute::forward(tC_rRow)), + tC_cRow(cute::forward(tC_cRow)), + n(get<1>(problem_shape)), + params_ptr(params_ptr) { } + + GTensor tC_gRow; + RTensor tC_rRow; + CTensor tC_cRow; + Params const* params_ptr; + int n; + + // This function is modified from VisitorRowBroadcast + CUTLASS_DEVICE void + begin_epilogue() { + clear(tC_rRow); + auto src_v = filter(tC_gRow); + auto coord_v = filter(tC_cRow); + auto dst_v = filter(tC_rRow); + + if (params_ptr->row_broadcast) { + // In this case we are loading from a row vector and broadcasting + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(src_v); ++i) { + bool guard = get<1>(coord_v(i)) < n; + cutlass::arch::global_load( + dst_v(i), (void const*)&src_v(i), guard); + } + } else { + // In this case we are loading from a scalar and broadcasting + VecType filled_vec; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < VecLength; 
i++) { + reinterpret_cast(&filled_vec)[i] = *(params_ptr->ptr_row); + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(src_v); ++i) { + if (get<1>(coord_v(i)) < n) { + dst_v(i) = filled_vec; + } + } + } + } + + template + CUTLASS_DEVICE auto // returns an Array + visit(int iter_idx, int row_idx, int column_idx, int frg_idx, + Array const& frg_acc) { + Tensor rRow_frg = recast>(coalesce(tC_rRow)); + return rRow_frg(column_idx); + } + }; + + template + CUTLASS_DEVICE auto + get_callbacks( + gemm::GemmCoord threadblock_tile_offset, + int thread_idx, + ProblemShape problem_shape + ) { + Tensor mRow = make_tensor( + make_gmem_ptr(params_ptr->ptr_row), + problem_shape, + params_ptr->dRow); + + // VECTOR, FRAGMENT_COLUMN + Tensor tC_gRow = recast( + ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset) + )(_,_,_0{},_0{},_0{},_0{}); + Tensor tC_rRow = make_tensor_like(tC_gRow); + + // Generate the pred tensor + Tensor cRow = make_identity_tensor(mRow.shape()); + Tensor tC_cRow = outer_partition( + ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}), + Shape>{}, + (_0{}) + ); + + return Callbacks< + decltype(tC_gRow), decltype(tC_rRow), + decltype(tC_cRow), ProblemShape>( + cute::move(tC_gRow), + cute::move(tC_rRow), + cute::move(tC_cRow), + problem_shape, + params_ptr + ); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// This is a modified RowBroadcast that will broadcast 0 if ptr_row is null +template< + class ThreadMap, + class Element, + class StrideMNL +> +struct VisitorRowOrZeroBroadcast { + + // This struct has been modified to remove null_default (because it's always 0) + struct Arguments { + Element const* ptr_row = nullptr; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + struct SharedStorage {}; + + // Global load type + static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits::value; + using VecType = uint_bit_t; + static int constexpr VecLength = sizeof(VecType) / sizeof(Element); + + CUTLASS_HOST_DEVICE + VisitorRowOrZeroBroadcast() { } + + CUTLASS_HOST_DEVICE + VisitorRowOrZeroBroadcast(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + template + struct Callbacks : EmptyCallbacks { + CUTLASS_DEVICE + Callbacks( + GTensor&& tC_gRow, + RTensor&& tC_rRow, + CTensor&& tC_cRow, + ProblemShape problem_shape, + Params const* params_ptr + ): + tC_gRow(cute::forward(tC_gRow)), + tC_rRow(cute::forward(tC_rRow)), + tC_cRow(cute::forward(tC_cRow)), + n(get<1>(problem_shape)), + params_ptr(params_ptr) { } + + GTensor tC_gRow; + RTensor tC_rRow; + CTensor tC_cRow; + Params const* params_ptr; + int n; + + // This function is modified from VisitorRowBroadcast + CUTLASS_DEVICE void + begin_epilogue() { + clear(tC_rRow); + auto src_v = filter(tC_gRow); + auto coord_v = filter(tC_cRow); + auto dst_v = filter(tC_rRow); + + if (params_ptr->ptr_row != nullptr) { + // In this case we are loading from a row vector and broadcasting + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(src_v); ++i) { + bool guard = get<1>(coord_v(i)) < n; + cutlass::arch::global_load( + dst_v(i), (void const*)&src_v(i), guard); + } 
+ } else { + // In this case we are broadcasting 0 + VecType filled_vec; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < VecLength; i++) { + reinterpret_cast(&filled_vec)[i] = Element{0}; + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(src_v); ++i) { + if (get<1>(coord_v(i)) < n) { + dst_v(i) = filled_vec; + } + } + } + } + + template + CUTLASS_DEVICE auto // returns an Array + visit(int iter_idx, int row_idx, int column_idx, int frg_idx, + Array const& frg_acc) { + Tensor rRow_frg = recast>(coalesce(tC_rRow)); + return rRow_frg(column_idx); + } + }; + + template + CUTLASS_DEVICE auto + get_callbacks( + gemm::GemmCoord threadblock_tile_offset, + int thread_idx, + ProblemShape problem_shape + ) { + Tensor mRow = make_tensor( + make_gmem_ptr(params_ptr->ptr_row), + problem_shape, + params_ptr->dRow); + + // VECTOR, FRAGMENT_COLUMN + Tensor tC_gRow = recast( + ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset) + )(_,_,_0{},_0{},_0{},_0{}); + Tensor tC_rRow = make_tensor_like(tC_gRow); + + // Generate the pred tensor + Tensor cRow = make_identity_tensor(mRow.shape()); + Tensor tC_cRow = outer_partition( + ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}), + Shape>{}, + (_0{}) + ); + + return Callbacks< + decltype(tC_gRow), decltype(tC_rRow), + decltype(tC_cRow), ProblemShape>( + cute::move(tC_gRow), + cute::move(tC_rRow), + cute::move(tC_cRow), + problem_shape, + params_ptr + ); + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + class ThreadMap, + class Element, + class StrideMNL = Stride<_1,_0,_0> +> +struct VisitorColOrScalarBroadcast { + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast. 
+ struct Arguments { + Element const* ptr_col = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + struct SharedStorage { }; + + CUTLASS_HOST_DEVICE + VisitorColOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + VisitorColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + template + struct Callbacks : EmptyCallbacks { + CUTLASS_DEVICE + Callbacks( + GTensor&& tC_gCol, + RTensor&& tC_rCol, + CTensor&& tC_cCol, + ProblemShape problem_shape, + Params const* params_ptr + ): + tC_gCol(cute::forward(tC_gCol)), + tC_rCol(cute::forward(tC_rCol)), + tC_cCol(cute::forward(tC_cCol)), + m(get<0>(problem_shape)), + params_ptr(params_ptr) { } + + GTensor tC_gCol; + RTensor tC_rCol; + CTensor tC_cCol; + Params const* params_ptr; + int m; + + // This function is modified from VisitorColBroadcast + CUTLASS_DEVICE void + begin_epilogue() { + clear(tC_rCol); + + Tensor pred = make_tensor(shape(tC_gCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tC_cCol(i)) < m; + } + + if (params_ptr->col_broadcast) { + // In this case we are loading from a column vector and broadcasting + copy_if(pred, tC_gCol, tC_rCol); + } else { + // In this case we are loading from a scalar and broadcasting + auto dst_v = filter(tC_rCol); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(dst_v); ++i) { + if (pred(i)) { + dst_v(i) = *(params_ptr->ptr_col); + } + } + } + } + + template + CUTLASS_DEVICE auto // returns an Array + visit(int iter_idx, int row_idx, int column_idx, int frg_idx, + Array const& frg_acc) { + Array frg_col; + frg_col.fill(tC_rCol(row_idx,iter_idx)); + return frg_col; + } + }; + + template + CUTLASS_DEVICE auto + get_callbacks( + gemm::GemmCoord threadblock_tile_offset, + int thread_idx, + ProblemShape problem_shape + ) { + Tensor mCol = make_tensor( + make_gmem_ptr(params_ptr->ptr_col), + problem_shape, + params_ptr->dCol); + + // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER + Tensor tC_gCol = group_modes<1,4>( + ThreadMap::partition(mCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_)); + Tensor tC_rCol = make_tensor_like(tC_gCol); + + // Generate the pred tensor + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tC_cCol = group_modes<1,4>( + ThreadMap::partition(cCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_)); + + return Callbacks< + decltype(tC_gCol), decltype(tC_rCol), + decltype(tC_cCol), ProblemShape>( + cute::move(tC_gCol), + cute::move(tC_rCol), + cute::move(tC_cCol), + problem_shape, + params_ptr + ); + } +}; + +} diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp new file mode 100644 index 0000000..58b1e8f --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp @@ -0,0 +1,447 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. 
SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+//
+// This file is a modified excerpt of
+// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
+// from https://github.com/NVIDIA/cutlass v3.5.0
+// It has been modified to support either row/column or scalar broadcasting
+// where the tensor being loaded from is always passed in via a device pointer.
+// This lets one compiled kernel handle all cases of per-tensor or
+// per-channel/per-token quantization.
+//
+// This interface also allows the scales to be passed in as tensors that
+// consistently reside on the device, which avoids an issue with a previous
+// implementation where scalars needed to be on the CPU since they
+// were passed in via float values. This created a potential performance hazard
+// if scales were initially on the device, and caused torch.compile graph
+// breaks when moving scales to the CPU.
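//
// Illustrative usage sketch (the tensor and alias names below are assumed for
// the example, not defined in this header): a caller builds the broadcast
// arguments from a device-resident scale tensor by passing the raw pointer
// together with a flag that says whether it is a real row vector or a single
// scalar to broadcast:
//
//   using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
//       0 /*Stages*/, CtaTileShapeMNK, float>;
//
//   ScaleB::Arguments scale_args{
//       b_scales.data_ptr<float>(),  // always a device pointer
//       b_scales.numel() != 1,       // false => broadcast *ptr_row as a scalar
//       {}};                         // default StrideMNL
//
// With row_broadcast == false the visitor simply fills its register fragment
// from *ptr_row, so per-tensor and per-channel scales share one compiled
// kernel.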
+// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcast { + static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. + struct Arguments { + Element const* ptr_row = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params) + , smem(const_cast(shared_storage.smem.data())) { } + + Params params; + Element *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row) == Element(0)); + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , tCcRow(tCcRow_) + , residue_tCcRow(residue_tCcRow_) + , params(params_) {} + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcRow; // (m, n) + ThrNum thr_num; + Params const& params; + + CUTLASS_DEVICE void 
+ begin() { + if (!params.row_broadcast) { + fill(tSR_rRow, *(params.ptr_row)); + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. + } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + if (!params.row_broadcast) return; // Do not issue LDS when row is scalar + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); + copy(tSR_sRow_flt, tSR_rRow_flt); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); + Tensor tGS_cRow = thr_g2s.partition_S(cRow); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.tCcD, + args.residue_cD, + ThreadCount{}, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcast { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + 
(cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. + struct Arguments { + Element const* ptr_col = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params const& params; + int m; + + CUTLASS_DEVICE void + begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col)); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_if(pred, filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + // Generate an identity tensor matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + params + ); + } +}; + +} diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp new file mode 100644 index 0000000..ad8c006 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -0,0 +1,321 @@ +#pragma once + +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp" + +/* + This file defines custom epilogues for fusing channel scales, token scales, + bias, and activation zero-points onto a GEMM operation using the + CUTLASS 2.x API, for sm80 (Ampere) NVIDIA GPUs. + + Epilogues must contain a public type named EVTCompute of type Sm80EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace vllm::c2x { + +using namespace cute; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + + template + using ColOrScalarLoad = + cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< + OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = + cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + template + using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast< + OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; + + template + using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + template + using RowOrZeroLoad = + cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. 
+ template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + // it would technically work but no use case as data_ptr is never nullptr + static_assert(!std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(std::optional const& tensor) { + static_assert(std::is_same_v>); + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. 
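 *
 * Put concretely (an informal sketch; Acc denotes the GEMM accumulator,
 * int32 for int8 inputs and float for fp8 inputs, with shapes written as
 * (rows, cols)):
 *
 *   D = a_scales * (b_scales * Acc) + bias
 *
 * where a_scales is (m, 1) or a single scalar, b_scales is (1, n) or a
 * single scalar, and bias is (1, n); every product and the final addition
 * broadcast elementwise, numpy-style.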
+ */ +template +struct ScaledEpilogueBias + : protected ScaledEpilogueBase { + protected: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, bias_args, {}}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzp + : protected ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowOrZeroLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_azp_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, 
bias_args, {}}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzpToken + : protected ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowOrZeroLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::threadblock::Sm80EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + std::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_acc_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; + } +}; + +}; // namespace vllm::c2x diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp new file mode 100644 index 0000000..cf79507 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -0,0 +1,450 @@ +#pragma once + +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp" + +/* + This file defines custom epilogues for fusing channel scales, token scales, + bias, and activation 
zero-points onto a GEMM operation using the + CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later. + + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace vllm::c3x { + +using namespace cute; + +template +struct identity { + CUTLASS_HOST_DEVICE + T operator()(T lhs) const { return lhs; } +}; + +template +struct TrivialEpilogue { + private: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + using Compute = cutlass::epilogue::fusion::Sm90Compute< + cutlass::epilogue::thread::Identity, ElementD, ElementAcc, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + template + static ArgumentType prepare_args(Args... args) { + return {}; + } +}; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + template + using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; + + // Don't want to support nullptr by default + template + using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< + 0 /*Stages*/, TileShape, T, T, Stride, Int<0>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; + + // Don't want to support nullptr by default + template + using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< + 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; + + template + using ColOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + static_assert(!std::is_same_v> && + !std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(std::optional const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr}; + } + + template + static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) { + using Arguments = typename Descriptor::Arguments; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr, do_broadcast}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch.scaled_mm_. 
+ + A and B may be both either int8 or fp8_e4m3. A can be + quantized per-tensor or per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, bias_args, {}}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogueBias, but the + * bias is a column vector instead of a row vector. Useful e.g. if we are + * computing a GEMM via C^T += B^T A^T. 
This happens in the 2:4 sparse kernels. + */ +template +struct ScaledEpilogueColumnBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template ColLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, bias_args, {}}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzp + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_azp_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; + } +}; + +/* + 
* This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzpToken + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::homogeneous_multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + std::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_acc_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; + } +}; + +/* + This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers + to arrays containing different scales used in group gemm. The number of + pointers in ScaleA and the number of pointers in ScaleB are equal to the + group size. 
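
  A hedged sketch of how a caller might assemble these arguments for a grouped
  GEMM with G groups ("Epilogue" stands for an instantiated ScaledEpilogueArray;
  the buffer names are illustrative, not taken from this file):

    // Both pointer arrays and the buffers they point at must be GPU-accessible,
    // since the visitors index ptr_col_array[group] / ptr_row_array[group] on
    // the device. Entry g holds the scales of group g.
    float const* const* a_scales_ptrs;   // G pointers, (m_g, 1) vectors or scalars
    float const* const* b_scales_ptrs;   // G pointers, (1, n_g) vectors or scalars
    auto args = Epilogue::prepare_args(a_scales_ptrs, b_scales_ptrs,
                                       /*a_col_broadcast=*/true,   // vectors, not scalars
                                       /*b_row_broadcast=*/true);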
+*/ +template +struct ScaledEpilogueArray + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoadArray; + using ScaleB = typename SUPER::template RowOrScalarLoadArray; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + using ScaleAArray = typename SUPER::template ColOrScalarLoadArray; + using ScaleBArray = typename SUPER::template RowOrScalarLoadArray; + + static ArgumentType prepare_args(float const* const* a_scales_ptr, + float const* const* b_scales_ptr, + bool a_col_broadcast, bool b_row_broadcast) { + auto a_args = SUPER::template args_from_tensor( + a_scales_ptr, a_col_broadcast); + auto b_args = SUPER::template args_from_tensor( + b_scales_ptr, b_row_broadcast); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; + } +}; + +}; // namespace vllm::c3x diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp new file mode 100644 index 0000000..ec75c29 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp @@ -0,0 +1,123 @@ +// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl +// clang-format off +#pragma once + +#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" + +#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA_TMA_WS_SS (BlockScaled Builders) +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + int ScaleGranularityM +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum, + cute::enable_if_t< + not detail::is_use_rmem_A()> +> { + using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum; + + static_assert(is_static::value); + static_assert(is_static::value); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); +#endif + static_assert(detail::is_aligned(), + "Should meet TMA alignment requirement\n"); + + static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v); + static constexpr bool IsFP8Input = detail::is_input_fp8(); + static_assert((!IsFP8Input || !IsArrayOfPointersGemm), + 
"KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); + + // For fp32 types, map to tf32 MMA value type + using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; + using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; + + static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); + + static constexpr bool IsCooperative = cute::is_any_of_v>; + using AtomLayoutMNK = cute::conditional_t>, Layout>>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using SmemLayoutAtomA = decltype(detail::ss_smem_selector< + GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(detail::ss_smem_selector< + GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + + using SmemCopyAtomA = void; + using SmemCopyAtomB = void; + + using CollectiveOp = CollectiveMma< + DispatchPolicy, + TileShape_MNK, + ElementA, + TagToStrideA_t, + ElementB, + TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + SmemCopyAtomA, + cute::identity, + GmemTiledCopyB, + SmemLayoutAtomB, + SmemCopyAtomB, + cute::identity + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp new file mode 100644 index 0000000..13b90e9 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp @@ -0,0 +1,183 @@ +// clang-format off +// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cute/algorithm/clear.hpp" +#include "cute/tensor.hpp" + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////FP8 Accumulation/////////////////////////// +////////////////////////////////////////////////////////////////////////////// +/// This class provides API to promote (add) or scale (multiply_add) the results +/// from the tensor core accumulators to the main accumulators when the number +/// of MMAs reaches the max number of MMA interval specified by user, after that +/// the tensor core accumulators are zeroed. +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +template < + class EngineAccum, + class LayoutAccum> +struct GmmaFP8AccumulationWithScale { + using TensorAccum = cute::Tensor; + using ElementAccumulator = typename EngineAccum::value_type; + + static_assert(is_static::value, "Accumulator Layout should be static"); + static_assert(is_rmem::value , "Accumulator tensor must be rmem resident."); + +private: + TensorAccum& accum_; + TensorAccum accum_temp_; + + uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. + uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop + uint32_t mma_count_; // current executed MMAs + uint32_t reset_accum_flag_; // accum needs to be zeroed or not. + + // promote or `add` the partial accumulators to main accumulator (FADD). + CUTLASS_DEVICE + void promote_core() { + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i); + } + } + + // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). 
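  // [Editorial annotation, not part of the original source] The FFMA variant
  // defined next folds the per-block dequantization scale into the promotion:
  // instead of the plain `accum_(i) += accum_temp_(i)` above, it computes
  //     accum_(i) += accum_temp_(i) * scale(i)
  // For example, a partial accumulator value of 12.0 with a block scale of
  // 0.5 contributes 6.0 to the main accumulator before the tensor-core
  // accumulators are zeroed for the next promotion interval.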
+ template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_core(const cute::Tensor &scale) { + using TensorScale = cute::Tensor; + + static_assert(is_static::value, "Scale Layout should be static"); + static_assert(is_rmem::value , "Scale tensor must be rmem resident."); + + static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); + + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i) * scale(i); + } + } + +public: + CUTLASS_DEVICE + GmmaFP8AccumulationWithScale( + TensorAccum &accum, + uint32_t accum_promotion_interval, + uint32_t mma_count_per_mainloop_iteration) + : accum_(accum), + accum_promotion_interval_(accum_promotion_interval), + mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), + mma_count_(0), + reset_accum_flag_(0) + { + accum_temp_ = cute::make_fragment_like(accum); + } + + // + // Methods (Common) + // + + CUTLASS_DEVICE + TensorAccum& operator()() { + return accum_temp_; + } + + /// prepare the MMA accumulators when initialization or zeroing is required. + CUTLASS_DEVICE + bool prepare_if_needed() { + return reset_accum_flag_; + } + + // + // Methods (for FADD version) + // + + /// promote (add) the results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_if_needed() { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + promote_core(); + mma_count_ = 0; + } + } + + /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_residue_if_needed() { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + promote_core(); + } + } + + // + // Methods (for FFMA version) + // + + /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_if_needed(const cute::Tensor &scale) { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + scale_core(scale); + mma_count_ = 0; + } + } + + /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_residue_if_needed(const cute::Tensor &scale) { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + scale_core(scale); + } + } +}; + +} // namespace cutlass::gemm::collective diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp new file mode 100644 index 0000000..ce7f47c --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -0,0 +1,729 @@ +// clang-format off +// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/trace.h" +#include "cutlass/numeric_types.h" + +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop +template < + int Stages, + class ClusterShape, + class KernelSchedule, + int ScaleGranularityM_, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + 
using ElementAccumulator = typename TiledMma::ValTypeC; + using ElementBlockScale = ElementAccumulator; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); + using MainloopPipeline = cutlass::PipelineTmaAsync; + using PipelineState = cutlass::PipelineState; + using PipelineParams = typename MainloopPipeline::Params; + + // Two threads per CTA are producers (1 for operand tile and 32 for scales) + static constexpr int NumProducerThreadEvents = 33; + + static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; + static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; + + static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); + + // Tile along modes in a way that maximizes the TMA box size. + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + + // Block scaling gmem-to-smem copy atom + using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; + using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; + + // Block scaling smem layout + using SmemLayoutScaleA = Layout, Int>>; + using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
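To make the scale-granularity bookkeeping above concrete, here is a minimal constexpr sketch; the tile width and granularity are hypothetical values chosen only for illustration, while the arithmetic mirrors the definitions above:

// Illustrative only: how ScaleGranularityM / ScaleMsPerTile resolve for a
// hypothetical 128-wide M tile.
constexpr int TileM              = 128;
constexpr int ScaleGranularityM_ = 64;   // a 0 here would mean "one scale per tile along M"
constexpr int ScaleGranularityM  = ScaleGranularityM_ == 0 ? TileM : ScaleGranularityM_;
constexpr int ScaleMsPerTile     = TileM / ScaleGranularityM;   // -> 2 scales of A per M tile
static_assert(TileM % ScaleGranularityM == 0,
              "FP8 scaling granularity must evenly divide the tile along M");
// Shared memory then carries ScaleMsPerTile x Stages scales for A and
// 1 x Stages scales for B, matching SmemLayoutScaleA / SmemLayoutScaleB above.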
+ + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v, + "ElementAccumulator and ElementBlockScale should be same datatype"); + + struct SharedStorage + { + struct TensorStorage : cute::aligned_struct<128> { + cute::array_aligned> smem_A; // mxk + cute::array_aligned> smem_B; // nxk + cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k + cute::array_aligned> smem_scale_B; // 1xk + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // Device side kernel params + struct Params { + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy_A_sm90( + GmemTiledCopyA{}, + make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), + SmemLayoutA{}(_,_,0), + TileShape{}, + ClusterShape{})); + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy_B_sm90( + GmemTiledCopyB{}, + make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), + SmemLayoutB{}(_,_,0), + TileShape{}, + ClusterShape{})); + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; + // Block scaling factors for A and B + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); + typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + uint32_t transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t transaction_bytes_nk = TmaTransactionBytesNK; + uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; + + return { + tma_load_a, + tma_load_b, + transaction_bytes, + transaction_bytes_mk, + transaction_bytes_nk, + args.ptr_scale_A, + args.ptr_scale_B + }; + } + + 
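For orientation, a rough sizing sketch of how many block scales the ptr_scale_A / ptr_scale_B arguments forwarded above need to reference; the problem and tile sizes are hypothetical, and the shapes mirror the (M/ScaleGranularityM, k_tiles, L) and (n_tiles, k_tiles, L) scale views built in load_init() further below:

// Hypothetical sizing example (not from this commit): number of scale
// elements behind ptr_scale_A / ptr_scale_B for one GEMM.
constexpr int M = 4096, N = 2048, K = 1024, L = 1;   // problem shape
constexpr int TileN = 128, TileK = 128;              // CTA tile extents along N and K
constexpr int ScaleGranularityM = 128;               // scaling granularity along M
constexpr int num_scales_A = (M / ScaleGranularityM) * (K / TileK) * L;  // 32 * 8 = 256
constexpr int num_scales_B = (N / TileN) * (K / TileK) * L;              // 16 * 8 = 128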
template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + constexpr int tma_alignment_bits = 128; + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + bool implementable = true; + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + return implementable; + } + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = 1; + static constexpr uint32_t TmaTransactionBytesMK = + cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytesNK = + cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) + { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) + /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) + template + CUTLASS_DEVICE auto + load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M,N,K,L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Make tiled views, defer the slice + Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + constexpr auto scales_m = Int{}; + auto tM = get<2>(gA_mkl.shape()); + auto tN = get<2>(gB_nkl.shape()); + auto tK = get<3>(gA_mkl.shape()); + + // Make the tiled views of scale tensors + auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) + auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); + auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) + auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); + + // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and + // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. 
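    // [Editorial annotation, not part of the original source] In other words:
    // the host already stores one A scale per (ScaleGranularityM x k-tile)
    // block with the scale-m mode fastest-varying, and one B scale per
    // (n-tile x k-tile) block, so the global-memory views built below need no
    // further re-tiling; the per-k-tile slice is taken later inside load().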
+ Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) + Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) + + return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TensorB, + class TensorScaleA, class TensorScaleB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PipelineState smem_pipe_write, + cute::tuple const& load_inputs, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + // Blockscaling: Tma loads for load_input and CpAsync for load_scale + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Prepare the TMA loads for A and B + // + + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + // Partition the inputs based on the current block coordinates. 
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + + // Block scaling: load_scale has scaling tensors in global memory which are not tiled + Tensor mScaleA_mkl = get<2>(load_inputs); + Tensor mScaleB_nkl = get<3>(load_inputs); + auto scales_m = get<0>(mScaleA_mkl.shape()); + + Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); + + Tensor gScaleA = local_tile( + mScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) + Tensor cScaleA = local_tile( + cScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); + Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) + + // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 + TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, + Layout>{}, Layout>{}); // (1,1,1) + TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, + Layout>{}, Layout>{}); // (1,1,1) + ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); + ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); + + Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); + Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); + Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); + + Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); + Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Allocate predicate tensors for a_scales (since we can't guarantee that + // all scales are valid, since we could have a partial tiles along M) + Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); + #pragma unroll + for (int i = 0; i < size(tApA_ScaleA); ++i) { + tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; + } + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + int write_stage = smem_pipe_write.index(); + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + // Copy operands A and B from global memory to shared memory + if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + + 
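      // [Editorial annotation, not part of the original source] Note the split
      // producer pattern here: the A/B operand tiles are fetched by a single
      // elected lane via TMA, while the per-block scales just below are fetched
      // with ordinary cp.async copies issued across the warp. This matches
      // NumProducerThreadEvents = 33 declared earlier (1 arrival for the TMA
      // transaction, 32 for the scale copies) and explains why producer_commit
      // uses the cp.async barrier arrive.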
// Copy scale tensors from global memory to shared memory + copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); + copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); + pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); + + ++k_tile_iter; + + // Advance smem_pipe_write + ++smem_pipe_write; + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail( + MainloopPipeline pipeline, + PipelineState smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_write); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgTensorC + > + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_read, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + + + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // Block scaling + Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), + Layout< + Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, + Stride, _0, Int> + >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Define C accumulators and A/B partitioning + // + + // Layout of warp group to thread mapping + + static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and + stride<0>(typename TiledMma::BLayout{}) == 0 and + size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and + size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, + "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); + + constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(Int{}, + Int{}); + + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); + + Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. 
+ + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_read; + + // Per block scale values for operand A and B + + using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. + using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above + + Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) + ElementBlockScale scale_b; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); + warpgroup_fence_operand(accumulation()); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers. + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + ++smem_pipe_read; + } + + warpgroup_fence_operand(accumulation()); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accumulation()); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + + // Advance smem_pipe_read and smem_pipe_release + ++smem_pipe_read; + ++smem_pipe_release; + } + + accumulation.scale_residue_if_needed(tCrScaleAViewAsC); + + warpgroup_fence_operand(accumulation()); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; + + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/dispatch_policy.hpp new file mode 100644 index 0000000..df809e2 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/gemm/dispatch_policy.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "cutlass/gemm/dispatch_policy.hpp" + +namespace cutlass::gemm { + +////////////////////////////////////////////////////////////////////////////// + +// FP8 related policies (including Blocked Scaled Accumulation) +// `ScaleGranularityM` specifies scaling granularity along M, while zero-value +// `ScaleGranularityM` indicates that scaling granularity is +// `size<0>(TileShape_MNK{})` along M. +template +struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum + : KernelTmaWarpSpecializedCooperative {}; + +// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp +// specialized dynamic schedule For FP8 kernels with Block Scaling +template , + class KernelSchedule = KernelTmaWarpSpecialized, + int ScaleGranularityM = + 0 // `ScaleGranularityM` specifies scaling granularity along M, + // while zero-value `ScaleGranularityM` indicates that scaling + // granularity is `size<0>(TileShape_MNK{})` along M. 
+ > +struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 + : MainloopSm90TmaGmmaWarpSpecialized { + static_assert( + cute::is_same_v< + KernelSchedule, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + ScaleGranularityM>>, + "KernelSchedule must be one of the warp specialized policies"); +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/torch_utils.hpp b/vllm_v0.10.0/csrc/cutlass_extensions/torch_utils.hpp new file mode 100644 index 0000000..a1ff933 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/torch_utils.hpp @@ -0,0 +1,160 @@ +#pragma once + +#include + +#include "cute/layout.hpp" +#include "cutlass/layout/matrix.h" +#include "cutlass/bfloat16.h" +#include "cutlass/half.h" + +using ColumnMajor = typename cutlass::layout::ColumnMajor; +using RowMajor = typename cutlass::layout::RowMajor; + +namespace cute { + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr auto tapply_with_idx(T&& t, F&& f, G&& g, + seq) { + return g(f(cute::get(static_cast(t)), I)...); +} + +template +CUTE_HOST_DEVICE constexpr auto make_shape_from_idx(F&& f, seq) { + return make_shape(f(I)...); +} + +}; // namespace detail + +template +CUTE_HOST_DEVICE constexpr auto transform_with_idx(T const& t, F&& f) { + if constexpr (cute::is_tuple::value) { + return detail::tapply_with_idx( + t, f, [](auto const&... a) { return cute::make_tuple(a...); }, + tuple_seq{}); + } else { + return f(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// calls: make_shape(f(0), f(1), ..., f(N-1)) +template +CUTE_HOST_DEVICE constexpr auto make_shape_from_idx(F&& f) { + return detail::make_shape_from_idx(f, make_seq{}); +} + +}; // namespace cute + +// Make a layout from a tensor with `rank(Stride{})`, where the shape is the +// shape of the passed in tensor and the strides are of type `Stride` and +// contain the strides of the passed in tensor, checking that any static strides +// in `Stride{}` match the strides of the passed in tensor. +// If `tensor.dim() < rank(Stride{})`, the shape is padded with 1s and the extra +// strides are set to be 0 or 1. 
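A hypothetical usage sketch of the helper declared next; the Stride type, tensor sizes, and dtype here are assumptions for illustration, and only the padding/checking behavior described in the comment above is taken from the source:

// Sketch only: mapping a 2-D row-major torch tensor onto a rank-3 (M, K, L)
// cute layout. The static Int<1> stride is verified against a.stride(1) via
// TORCH_CHECK, and the missing L mode is padded with extent 1 / stride 0.
using StrideA = cute::Stride<int64_t, cute::Int<1>, int64_t>;   // (M, K, L), K contiguous
torch::Tensor a = torch::empty({256, 512},
    torch::dtype(torch::kFloat8_e4m3fn).device(torch::kCUDA));
auto layout_a = make_cute_layout<StrideA>(a, "a");   // -> shape (256, 512, 1)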
+template +static inline auto make_cute_layout(torch::Tensor const& tensor, + std::string_view name = "tensor") { + TORCH_CHECK(tensor.dim() <= rank(Stride{})); + auto stride = cute::transform_with_idx( + Stride{}, [&](auto const& stride_ele, auto const& idx) { + using StrideEle = std::decay_t; + + if (idx < tensor.dim()) { + if constexpr (cute::is_static_v) { + TORCH_CHECK(StrideEle::value == tensor.stride(idx), "Expected ", + name, ".stride(", idx, ") to be ", StrideEle::value); + return StrideEle{}; + } else { + if (tensor.size(idx) == 1) { + // use 0 stride for dim with size 1, this is easier for + // cute/cutlass to optimize (helps the TMA code flatten dims) + return StrideEle{0}; + } else { + return tensor.stride(idx); + } + } + } else { + // Extra strides are assumed to be 0 or 1 + if constexpr (cute::is_static_v) { + static_assert(StrideEle::value == 0 || StrideEle::value == 1); + } + return StrideEle{}; + } + }); + + auto shape = cute::make_shape_from_idx([&](auto const& idx) { + if (idx < tensor.dim()) + return tensor.size(idx); + else + return int64_t(1); + }); + + return make_layout(shape, stride); +} + +template +static inline auto maybe_make_cute_layout( + std::optional const& tensor, + std::string_view name = "tensor") { + using Layout = decltype(make_cute_layout(*tensor)); + + if (tensor) { + return std::optional{make_cute_layout(*tensor, name)}; + } else { + return std::optional{}; + } +} + +// +// Torch Type to Cutlass Type (equivalent_cutlass_type) +// + +template +struct equivalent_cutlass_type { + using type = T; +}; + +template +using equivalent_cutlass_type_t = typename equivalent_cutlass_type::type; + +template <> +struct equivalent_cutlass_type { + using type = cutlass::half_t; +}; + +template <> +struct equivalent_cutlass_type { + using type = cutlass::bfloat16_t; +}; + +// +// equivalent_scalar_t (basically inverse of equivalent_cutlass_type) +// + +// Return a `c10::CppTypeToScalarType` compatible type, i.e. get the C++ from +// c10 that is equivalent to T, e.g.: `cutlass::half_t -> c10::Half` +template +struct equivalent_scalar_type { + using type = T; +}; + +template +using equivalent_scalar_type_t = typename equivalent_scalar_type::type; + +template <> +struct equivalent_scalar_type { + using type = c10::Half; +}; + +template <> +struct equivalent_scalar_type { + using type = c10::BFloat16; +}; + +// get equivalent c10::ScalarType tag from compile time type +template +static inline constexpr c10::ScalarType equivalent_scalar_type_v = + c10::CppTypeToScalarType>::value; \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/vllm_collective_builder.cuh b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_collective_builder.cuh new file mode 100644 index 0000000..e7fbba4 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -0,0 +1,43 @@ +#pragma once + +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +namespace cutlass::gemm::collective { +using namespace cute; + +// +// VLLMCollectiveBuilder is a wrapper around CollectiveBuilder that allows for +// for custom kernel tags, allowing you to build custom collectives. Without +// touching the cutlass library headers, using `CutlassKernelTag` will mean it +// will resort to using the standard cutlass collective builder. +// + +// Use the default Cutlass collective builder, i.e. 
use an unmodified cutless +// collective +struct CutlassKernelTag {}; + +template +struct VLLMCollectiveBuilder { + static_assert(sizeof(ElementA) == 0, + "Could not build a collective for given parameters."); +}; + +template +struct VLLMCollectiveBuilder< + CutlassKernelTag, ArchTag, OpClass, ElementA, GmemLayoutA, AlignmentA, + ElementB, GmemLayoutB, AlignmentB, ElementAccumulator, TileShape_MNK, + ClusterShape_MNK, StageCountType, KernelScheduleType> { + using CollectiveOp = typename CollectiveBuilder< + ArchTag, OpClass, ElementA, GmemLayoutA, AlignmentA, ElementB, + GmemLayoutB, AlignmentB, ElementAccumulator, TileShape_MNK, + ClusterShape_MNK, StageCountType, KernelScheduleType>::CollectiveOp; +}; + +}; // namespace cutlass::gemm::collective \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/vllm_custom_types.cuh b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_custom_types.cuh new file mode 100644 index 0000000..6146bdc --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_custom_types.cuh @@ -0,0 +1,50 @@ +#pragma once + +#include "cutlass/integer_subbyte.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct vllm_biased_integer_subbyte : public integer_subbyte { + using Base = integer_subbyte; + + using Storage = typename Base::Storage; + using xint_t = typename Base::xint_t; + + using Base::bits_mask_; + using Base::sign_mask_; + using Base::storage; + + // + // Methods + // + + /// No operation + vllm_biased_integer_subbyte() = default; + + /// Conversion from integer type + CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(int value) + : Base(value) {} + CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(unsigned value) + : Base(value) {} + CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(double value) + : Base(value) {} +}; +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// "GPTQ" types, i.e. 
symmetric quantization +using vllm_uint4b8_t = vllm_biased_integer_subbyte<4, 8>; // u4b8 +using vllm_uint8b128_t = vllm_biased_integer_subbyte<8, 128>; // u8b128 + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct sizeof_bits> { + static constexpr int value = Bits; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_cutlass_library_extension.py new file mode 100644 index 0000000..1dd7101 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import enum +from typing import Union + +from cutlass_library import * + +# +# Extend cutlass library with custom types, and missing values +# + + +class VLLMDataType(enum.Enum): + u4b8 = enum_auto() + u8b128 = enum_auto() + + +class MixedInputKernelScheduleType(enum.Enum): + TmaWarpSpecialized = enum_auto() + TmaWarpSpecializedPingpong = enum_auto() + TmaWarpSpecializedCooperative = enum_auto() + + +VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { + **DataTypeNames, # type: ignore + **{ + VLLMDataType.u4b8: "u4b8", + VLLMDataType.u8b128: "u8b128", + } +} + +VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { + **DataTypeTag, # type: ignore + **{ + VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", + VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t", + } +} + +VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { + **DataTypeSize, # type: ignore + **{ + VLLMDataType.u4b8: 4, + VLLMDataType.u8b128: 8, + } +} + +VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { + VLLMDataType.u4b8: "vllm::kU4B8", + VLLMDataType.u8b128: "vllm::kU8B128", + DataType.u4: "vllm::kU4", + DataType.u8: "vllm::kU8", + DataType.s4: "vllm::kS4", + DataType.s8: "vllm::kS8", + DataType.f16: "vllm::kFloat16", + DataType.bf16: "vllm::kBfloat16", +} + +VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { + DataType.u8: "at::ScalarType::Byte", + DataType.s8: "at::ScalarType::Char", + DataType.e4m3: "at::ScalarType::Float8_e4m3fn", + DataType.s32: "at::ScalarType::Int", + DataType.f16: "at::ScalarType::Half", + DataType.bf16: "at::ScalarType::BFloat16", + DataType.f32: "at::ScalarType::Float", +} + +VLLMKernelScheduleTag: dict[Union[ + MixedInputKernelScheduleType, KernelScheduleType], str] = { + **KernelScheduleTag, # type: ignore + **{ + MixedInputKernelScheduleType.TmaWarpSpecialized: + "cutlass::gemm::KernelTmaWarpSpecialized", + MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: + "cutlass::gemm::KernelTmaWarpSpecializedPingpong", + MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: + "cutlass::gemm::KernelTmaWarpSpecializedCooperative", + } + } diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/vllm_numeric_conversion.cuh b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_numeric_conversion.cuh new file mode 100644 index 0000000..90f226c --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_numeric_conversion.cuh @@ -0,0 +1,992 @@ +#pragma once + +#include "cutlass/numeric_conversion.h" +#include "cutlass_extensions/vllm_custom_types.cuh" +#include "cutlass_extensions/cute_utils.cuh" +#include "cutlass_extensions/vllm_type_utils.cuh" + +// this file 
extends: +// https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h +// with vllm specific type conversions, namely: vllm_uint4b8_t, vllm_uint8b128_t +// as well as adds interleaved numeric array converters for specific types. +// (interleaved numeric array converters can be more efficient for subbyte +// types) + +namespace cutlass { + +// InterleavedNumericArrayConverter is like NumericArrayConverter but also +// deinterleaves converted elements based on IlvBlkLayout, interleaving can +// make subbyte converts more efficient by allowing for efficient extraction +// of subbyte elements from a 32bit register. +template +struct InterleavedNumericArrayConverter { + using Converter = NumericArrayConverter; + + using result_type = typename Converter::result_type; + using source_type = typename Converter::source_type; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + if (cute::elect_one_sync()) { + if constexpr (std::is_same_v) { + printf( + "Convert %s <= %s (N = %d, IlvBlkLayout = void), not implemented\n", + nameof_v, nameof_v, N); + } else { + printf( + "Convert %s <= %s (N = %d, size(IlvBlkLayout{}) = %d), not " + "implemented\n", + nameof_v, nameof_v, N, size(IlvBlkLayout{})); + } + __brkpt(); + } + return {}; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +template +struct InterleavedNumericArrayConverter< + IlvBlkLayout, T, S, N, Round, + std::enable_if_t()>> { + using Converter = NumericArrayConverter; + + using result_type = typename Converter::result_type; + using source_type = typename Converter::source_type; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return Converter::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +template +struct ArrayConverterPacked32Bit { + using result_type = Array; + using source_type = Array; + + using result_packed_8_t = Array; + using result_packed_4_t = Array; + using result_packed_2_t = Array; + using src_packed_8_t = Array; + using src_packed_4_t = Array; + using src_packed_2_t = Array; + + static_assert(N % 2 == 0, "N must be a multiple of 2"); + static_assert(cutlass::sizeof_bits_v >= 4); // TODO: add 16 packed sources + static_assert(32 % cutlass::sizeof_bits_v == 0); + static constexpr auto src_elems_per_32bit_reg = + 32 / cutlass::sizeof_bits_v; + + // Maybe not Valid. ScalarConverter will not actually work unless + // NumericConverter is implemented. However it won't be used + // anyways since we assert N % 2 == 0, just here for compliance with + // VectorizedConverter. + using ScalarConverter = NumericConverter; + + template + CUTLASS_DEVICE static auto to_regs(PackedSrc const& src) { + if constexpr (sizeof(PackedSrc) == 1) { + return Array{reinterpret_cast(src)}; + } else if constexpr (sizeof(PackedSrc) == 2) { + return Array{reinterpret_cast(src)}; + } else if constexpr (sizeof(PackedSrc) == 4) { + return Array{reinterpret_cast(src)}; + } else { + static_assert(sizeof(PackedSrc) == 8); + return reinterpret_cast const&>(src); + } + } + + // The core converter uses bit tricks to construct a known FP16 number, then + // does a subtraction in FP16 for the final result. 
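A scalar worked example of the "construct a known FP16, then subtract" trick mentioned above, using the constants spelled out in the int4-to-FP16 converter further below; this is plain float math purely for illustration, whereas the real code operates on packed halves with prmt and hfma:

#include <cassert>
int main() {
  unsigned stored  = 0xB;                      // low nibble of a vllm_uint4b8_t: 11 encodes 11 - 8 = 3
  float    built   = 1024.0f + float(stored);  // what OR-ing the nibble into the FP16 magic constructs
  float    decoded = built - 1032.0f;          // 1032 = 1024 (magic offset) + 8 (storage bias)
  assert(decoded == 3.0f);                     // recovered signed value
  // The high nibble is handled analogously, with the extra factor of 16
  // folded into the fused multiply-add.
  return 0;
}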
+ template + CUTLASS_DEVICE static PackedResultType packed_convert( + PackedSrcType const& source) { + static_assert(PackedSrcType::kElements == PackedResultType::kElements); + static_assert(PackedResultType::kElements == 2 || + PackedResultType::kElements == 4 || + PackedResultType::kElements == 8, + "Invalid PackedResultType must be 2, 4 or 8."); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + return RegConvert32bit::template convert(to_regs(source)); + } + + friend class detail::VectorizedConverter; + + public: + CUTLASS_DEVICE static result_type convert(source_type const& source) { + result_type result; + using ConverterType = + ArrayConverterPacked32Bit; + + if constexpr (src_elems_per_32bit_reg >= 8) { + detail::VectorizedConverter::convert< + ConverterType, result_packed_8_t, src_packed_8_t, result_packed_4_t, + src_packed_4_t, result_packed_2_t, src_packed_2_t>(result, source); + } else if constexpr (src_elems_per_32bit_reg >= 4) { + detail::VectorizedConverter::convert(result, source); + } else { + detail::VectorizedConverter::convert(result, source); + } + + return result; + } +}; + +// Convert 8 4bit values packed into a 32bit register to 8 8bit values packed +// into 2 32bit register. +template +CUTLASS_DEVICE cutlass::AlignedArray lut_4bit_to_8bit_convert( + uint32_t src) { + cutlass::AlignedArray r; + // Determines if the value is in the top half of the LUT if set or + // (i.e. LUT[8:15]) in the bottom half (i.e. LUT[0:7]) if not set. Then move + // into bit position 0x4 of each nibble so when or'd with final_prmt_base it + // selects the correct candidate. When elements in final_prmt_base + // are >= 0x4, the high candidate is selected (i.e. LUT[8:15]), when elements + // are < 0x4, the low candidate is selected (i.e. LUT[0:7]) + uint32_t high_bit = (src & 0x88888888) >> 1; + + // `high_bit` is OR'd with 0x31203120 to find the correct value in the LUT + // (selects correct high or low candidate) + const uint32_t final_prmt_base = 0x32103210; + + // Ignore the high bit when indexing into LUT, for each 4bit value + // we index into both the high and low candidates then use + // high_bit | final_prmt_base to select the correct candidate + uint32_t lut_idx = (src & 0x77777777); + + auto pack = [](uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) | + (uint32_t(d) << 24); + }; + + static constexpr uint32_t LOW_0 = pack(LUT0, LUT1, LUT2, LUT3); + static constexpr uint32_t LOW_1 = pack(LUT4, LUT5, LUT6, LUT7); + static constexpr uint32_t HIGH_0 = pack(LUT8, LUT9, LUT10, LUT11); + static constexpr uint32_t HIGH_1 = pack(LUT12, LUT13, LUT14, LUT15); + + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 2; ++ii, lut_idx >>= 16, high_bit >>= 16) { + uint32_t final_prmt_idx = final_prmt_base | high_bit; + + // This uses a look up table to convert packed int4s to packed int8s, + // using the int4 value as the index to prmt. It first select both the + // high and low candidates, then uses the high bit (i.e. `high_bit`) to + // select the correct candidate. 
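+    // Reference for the prmt.b32 usage below (standard PTX semantics): the two
+    // source registers form an 8-byte table b0..b7 and each selector nibble of
+    // the control word picks one byte, e.g.
+    //   prmt(0x03020100, 0x07060504, 0x3210) -> 0x03020100
+    // So `low`/`high` gather LUT[idx] and LUT[idx + 8] for four nibbles at a
+    // time, and the final prmt chooses between them because `high_bit`
+    // contributes 0x4 to a selector exactly when the source nibble was >= 8.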
+ asm volatile( + "{\n" + " .reg .b32 low, high;\n" + " prmt.b32 low, %1, %2, %5;\n" + " prmt.b32 high, %3, %4, %5;\n" + " prmt.b32 %0, low, high, %6;\n" + "}\n" + : "=r"(r[ii]) + : "n"(LOW_0), "n"(LOW_1), "n"(HIGH_0), "n"(HIGH_1), "r"(lut_idx), + "r"(final_prmt_idx)); + } + + return r; +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as int8s + auto r = lut_4bit_to_8bit_convert<0xF8, 0xF9, 0xFA, 0xFB, // + 0xFC, 0xFD, 0xFE, 0xFF, // + 0x00, 0x01, 0x02, 0x03, // + 0x04, 0x05, 0x06, 0x07>(src_[0]); + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as fp8s + auto r = lut_4bit_to_8bit_convert<0xD0, 0xCE, 0xCC, 0xCA, // + 0xC8, 0xC4, 0xC0, 0xB8, // + 0x00, 0x38, 0x40, 0x44, // + 0x48, 0x4A, 0x4C, 0x4E>(src_[0]); + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + using RegArray = + cutlass::AlignedArray; + RegArray r; + + // Below constructs the following temporary: + // fp16s_01 = {0x00, i4_01, 0x00, i4_01} + // fp16s_23 = {0x00, i4_23, 0x00, i4_23} + // fp16s_45 = {0x00, i4_45, 0x00, i4_45} + // fp16s_67 = {0x00, i4_67, 0x00, i4_67} + // We use inline asm instead of __byte_perm intrinsic since we don't want + // the documented (& 0x7) on the index. NVCC might be able to optimize it + // out since the index is a constexpr, but we choose to be safe about it + // here. + uint32_t prmt_indices[4] = {0x4040, 0x4141, 0x4242, 0x4343}; + static_assert(RegArray::kElements <= 4, + "Too many inputs for F16 -> I4 vector converter"); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + asm volatile( + "{\n" + " prmt.b32 %0, %1, %2, %3;\n" + "}\n" + : "=r"(r[ii]) + : "r"(src), "n"(0), "r"(prmt_indices[ii])); + } + + // Since the stored 4bit values are biased by 8 we get stored_val = (x+8) + // we are trying to construct x and a fp16 value + // The below XOR does the following: + // 1) Sets the exponent bits of the FP16 to the correct value for the + // FP16 magic_num. We will be constructing {1024+16*(x1+8), 1024+(x0+8)}, + // where x1 in the high nibble and x0 is the low nibble then using hfma + // to subtract 1032 from that + // The AND does the following: + // 1) Clear the set bits for the int4 we will ignore. 
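+      // Worked example (illustration only): for a source byte 0x5B, i.e.
+      // stored nibbles x1+8 = 5 and x0+8 = 11, the low fp16 lane becomes
+      // 0x640B = 1035 and the high lane 0x6450 = 1104; the hfma below then
+      // gives 1035 * 1 - 1032 = 3 = x0 and 1104 * (1/16) - 72 = -3 = x1.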
+ // We use lop3 so that we can use 1 instruction for AND and XOR. + static constexpr uint32_t xor_mask = 0x64006400; + static constexpr uint32_t and_mask = 0xFFF0FF0F; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + + // For each operand, computes: + // r[i] = (r[i] & and_mask) ^ xor_mask + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(and_mask), "n"(xor_mask), "n"(immLut)); + } + + // We will issue 2 hfmas that do the following: + // {x1, x0} = {1024+16*(x1+8), 1024+(x0+8)} * {1/16, 1} - {72, 1032} + // = {x1 + 1152, x0 + 1032} * {1/16, 1} - {72, 1032} + static constexpr uint32_t hfma_bias_rep = 0xD480E408; // {72, 1032} + static constexpr uint32_t hfma_scale_rep = 0x2C003C00; // {1 / 16, 1} + + const half2& hfma_bias = reinterpret_cast(hfma_bias_rep); + const half2& hfma_scale = reinterpret_cast(hfma_scale_rep); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]); + fp16x2_val = __hfma2(hfma_scale, fp16x2_val, hfma_bias); + } + + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +// for IlvdLayout: (2, 4):(4, 1) +template +struct InterleavedNumericArrayConverter, Stride<_4, _1>>, + cutlass::half_t, vllm_uint4b8_t, N, + Round, void> { + using IlvdLayout = Layout, Stride<_4, _1>>; + static_assert(N % size(IlvdLayout{}) == 0); + + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + using RegArray = + cutlass::AlignedArray; + RegArray r; + + static_assert(PackedResultType::kElements <= size(IlvdLayout{})); + static constexpr uint32_t xor_mask = 0x64006400; + + for (int ii = 0; ii < RegArray::kElements; ii += 2) { + auto src_ = src >> (4 * (ii)); + r[ii + 0] = src_; + r[ii + 1] = src_; + + static constexpr uint32_t and_xor_imm_lut = (0xf0 & 0xcc) ^ 0xaa; + + static constexpr uint32_t low_nib_mask = 0x000F000F; + static constexpr uint32_t high_nib_mask = 0x00F000F0; + + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 0]) + : "n"(low_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); + + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 1]) + : "n"(high_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); + + // For low nibble: + // {x1, x0} = {1024+(x1+8), 1024+(x0+8)} * {1, 1} - {1032, 1032} + // For high nibble: + // {x1, x0} = {1024+16*(x1+8), 1024+16*(x0+8)} * {1/16, 1/16} + // - {72, 72} + static constexpr uint32_t low_nib_bias = 0x64086408; // {1032, 1032} + static constexpr uint32_t high_nib_scale = 0x2C002C00; // {1/16, 1/16} + static constexpr uint32_t high_nib_bias = 0xD480D480; // {-72, -72} + + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); + fp16x2_val = + __hsub2(fp16x2_val, reinterpret_cast(low_nib_bias)); + } + + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); + fp16x2_val = __hfma2(fp16x2_val, + reinterpret_cast(high_nib_scale), + reinterpret_cast(high_nib_bias)); + } + } + + return reinterpret_cast(r); + }; + }; + + public: + 
CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +// for IlvdLayout: (2, 4):(4, 1) +template +struct InterleavedNumericArrayConverter, Stride<_4, _1>>, + cutlass::half_t, uint4_t, N, Round, + void> { + using IlvdLayout = Layout, Stride<_4, _1>>; + static_assert(N % size(IlvdLayout{}) == 0); + + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + using RegArray = + cutlass::AlignedArray; + RegArray r; + + static_assert(PackedResultType::kElements <= size(IlvdLayout{})); + static constexpr uint32_t xor_mask = 0x64006400; + + for (int ii = 0; ii < RegArray::kElements; ii += 2) { + auto src_ = src >> (4 * (ii)); + r[ii + 0] = src_; + r[ii + 1] = src_; + + static constexpr uint32_t and_xor_imm_lut = (0xf0 & 0xcc) ^ 0xaa; + + static constexpr uint32_t low_nib_mask = 0x000F000F; + static constexpr uint32_t high_nib_mask = 0x00F000F0; + + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 0]) + : "n"(low_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); + + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 1]) + : "n"(high_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); + + // For low nibble: + // {x1, x0} = {1024+x1, 1024+x0} - {1024, 1024} + // For high nibble: + // {x1, x0} = {1024+16*x1, 1024+16*x0} * {1/16, 1/16} - {64, 64} + static constexpr uint32_t low_nib_bias = 0x64006400; // {1024, 1024} + static constexpr uint32_t high_nib_scale = 0x2C002C00; // {1/16, 1/16} + static constexpr uint32_t high_nib_bias = 0xD400D400; // {-64, -64} + + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); + fp16x2_val = + __hsub2(fp16x2_val, reinterpret_cast(low_nib_bias)); + } + + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); + fp16x2_val = __hfma2(fp16x2_val, + reinterpret_cast(high_nib_scale), + reinterpret_cast(high_nib_bias)); + } + } + + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + // Hold output FP16s in reg. We need 1 reg for every 2 elements + using RegArray = + cutlass::AlignedArray; + RegArray r; + + uint32_t const prmt_indices[2] = {0x5150, 0x5352}; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + for (int ii = 0; ii < RegArray::kElements; ++ii) { + asm volatile("prmt.b32 %0,%1,%2,%3;\n" + : "=r"(r[ii]) + : "r"(src), "n"(start_byte_for_fp16), + "r"(prmt_indices[ii])); + } + + // -128 is folded into bias subtraction, i.e. 
the 0x80 in the low bytes + static constexpr uint32_t bias_rep = 0x64806480; + const half2& bias = reinterpret_cast(bias_rep); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]); + fp16x2_val = __hsub2(fp16x2_val, bias); + } + + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + PackedResultType r; + + // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of + // u8x4 source and stores the result in r (without introducing extra + // cvt.u32.u8 instruction) + uint32_t const prmt_indices[4] = {0x7650, 0x7651, 0x7652, 0x7653}; + uint32_t* result_as_int = reinterpret_cast(&r); + for (int ii = 0; ii < PackedResultType::kElements; ++ii) { + result_as_int[ii] = __byte_perm(src, 0x4B000000, prmt_indices[ii]); + // Subtract the magic number 0x4B000000 from tmp in floating-point + // arithmetic to obtain final result + r[ii] -= (8388608.f + 128.f); // fold in -128 bias + } + + return r; + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src_reg = src_[0]; + // Hold output BF16s in reg. We need 1 reg for every 2 elements + using RegArray = + cutlass::AlignedArray; + RegArray r; + uint32_t src_reg_shifted = src_reg >> 4; + + // Below constructs the following temporary: + uint32_t const prmt_indices[4] = {0xF4F0, 0xF5F1, 0xF6F2, 0xF7F3}; + static_assert(RegArray::kElements <= 4, + "Too many inputs for uint4b8_t -> BF16 vector converter"); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + asm volatile( + "{\n" + " prmt.b32 %0, %1, %2, %3;\n" + "}\n" + : "=r"(r[ii]) + : "r"(src_reg), "r"(src_reg_shifted), "r"(prmt_indices[ii])); + } + + // Since the stored 4bit values are biased by 8 we get stored_val = (x+8) + // we are trying to construct x and a BF16 value + // The below XOR does the following: + // 1) Sets the exponent bits of the BF16 to the correct value for the + // BF16 magic_num. 
We will be constructing {128 + (x1+8), 128 + (x0+8)} + // and subtracting 136 to get {x1, x0} + static constexpr uint32_t xor_mask = 0x43004300; + static constexpr uint32_t and_mask = 0x000F000F; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + + // For each operand, computes: + // r[i] = (r[i] & and_mask) ^ xor_mask + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(and_mask), "n"(xor_mask), "n"(immLut)); + } + + // We will issue 2 bfmas that do the following: + // high BF16: + // hi_bf16 - 136, lo_bf16 - 136 + + // This is the BF16 {136, 136} represented as an integer. + static constexpr uint32_t bias_rep = 0x43084308; + const __nv_bfloat162& bias = + reinterpret_cast(bias_rep); + + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(bf16x2_val, bias); + } + + return reinterpret_cast(r); + } + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +// for IlvdLayout: (2, 4):(4, 1) +template +struct InterleavedNumericArrayConverter, Stride<_4, _1>>, + cutlass::bfloat16_t, vllm_uint4b8_t, N, + Round, void> { + using IlvdLayout = Layout, Stride<_4, _1>>; + static_assert(N % size(IlvdLayout{}) == 0); + + using result_type = Array; + using source_type = Array; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + using RegArray = + cutlass::AlignedArray; + RegArray r; + + static_assert(PackedResultType::kElements <= size(IlvdLayout{})); + static constexpr uint32_t or_mask = 0x43004300; + + // Unlike float16 where the mantissa is large enough to contain 2 + // nibbles, bfloat16 can only fit one, so we can only convert one + // nibble at a time + for (int ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src >> (4 * ii); + + static constexpr uint32_t and_or_imm_lut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t low_nib_mask = 0x000F000F; + + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 0]) + : "n"(low_nib_mask), "n"(or_mask), "n"(and_or_imm_lut)); + + // For low nibble: + // {x1, x0} = {128+(x1+8), 128+(x0+8)} * {1, 1} - {136, 136} + static constexpr uint32_t low_nib_bias = 0x43084308; // {136, 136} + + { + __nv_bfloat162& fp16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + fp16x2_val = + __hsub2(fp16x2_val, + reinterpret_cast(low_nib_bias)); + } + } + + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +// for IlvdLayout: (2, 4):(4, 1) +template +struct InterleavedNumericArrayConverter, Stride<_4, _1>>, + cutlass::bfloat16_t, uint4_t, N, Round, + void> { + using IlvdLayout = Layout, Stride<_4, _1>>; + static_assert(N % size(IlvdLayout{}) == 0); + + using result_type = Array; + using source_type = Array; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; + using RegArray = + 
cutlass::AlignedArray; + RegArray r; + + static_assert(PackedResultType::kElements <= size(IlvdLayout{})); + static constexpr uint32_t or_mask = 0x43004300; + + // Unlike float16 where the mantissa is large enough to contain 2 + // nibbles, bfloat16 can only fit one, so we can only convert one + // nibble at a time + for (int ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src >> (4 * ii); + + static constexpr uint32_t and_or_imm_lut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t low_nib_mask = 0x000F000F; + + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(low_nib_mask), "n"(or_mask), "n"(and_or_imm_lut)); + + // For low nibble: + // {x1, x0} = {128 + x1, 128 + x0} * {1, 1} - {128, 128} + static constexpr uint32_t low_nib_bias = 0x43004300; // {128, 128} + + { + __nv_bfloat162& fp16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + fp16x2_val = + __hsub2(fp16x2_val, + reinterpret_cast(low_nib_bias)); + } + } + + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + static FloatRoundStyle const round_style = Round; + + private: + using result_packed_4_t = Array; + using result_packed_2_t = Array; + using src_packed_4_t = Array; + using src_packed_2_t = Array; + + // Not Valid, not supported, only here to satisfy the interface and to avoid + // a compile error. ScalarConverter will not actually work until + // NumericConverter is + // implemented + using ScalarConverter = + NumericConverter; + + template + CUTLASS_DEVICE static PackedResultType packed_convert( + PackedSrcType const& source) { + static_assert( + (platform::is_same::value && + platform::is_same::value) || + (platform::is_same::value && + platform::is_same::value), + "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private " + "convert dispatch."); + + NumericArrayConverter + convert_uint8_to_f32; + Array tmp = + convert_uint8_to_f32(source); + NumericArrayConverter + convert_f32_to_bf16_; + return convert_f32_to_bf16_(tmp); + } + + friend class detail::VectorizedConverter; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + result_type result; + using ConverterType = + NumericArrayConverter; + detail::VectorizedConverter::convert(result, source); + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +#endif + +// for Array <= Array +// FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + struct RegConvert { + // FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 + template + CUTLASS_DEVICE static PackedResultType convert( + Array src) { + // Hold output int8s in reg. 
We need 1 reg for every 4 elements + using RegArray = cutlass::AlignedArray< + uint32_t, std::max(PackedResultType::kElements / 4, size_t(1))>; + RegArray r; + + static constexpr uint32_t MAGIC_BIAS_ = 0x64806480; + auto MAGIC_BIAS = *reinterpret_cast(&MAGIC_BIAS_); + + *reinterpret_cast(&src[0]) = + __hadd2(*reinterpret_cast(&src[0]), MAGIC_BIAS); + + if constexpr (src_regs > 1) { + *reinterpret_cast(&src[1]) = + __hadd2(*reinterpret_cast(&src[1]), MAGIC_BIAS); + } + + static_assert(PackedResultType::kElements <= 4); + uint32_t uint8s; + static constexpr uint32_t MASK_0246 = 0x6420; + static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080; + asm volatile("prmt.b32 %0,%1,%2,%3;\n" + : "=r"(uint8s) + : "r"(src[0]), "r"((src_regs > 1) ? src[1] : src[0]), + "n"(MASK_0246)); + + uint32_t int8s = (uint8s ^ UINT8s_TO_INT8s_MASK); + + return reinterpret_cast(int8s); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/vllm_v0.10.0/csrc/cutlass_extensions/vllm_type_utils.cuh b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_type_utils.cuh new file mode 100644 index 0000000..500ed50 --- /dev/null +++ b/vllm_v0.10.0/csrc/cutlass_extensions/vllm_type_utils.cuh @@ -0,0 +1,42 @@ +#include "cutlass/bfloat16.h" +#include "cutlass/half.h" +#include "cuda_bf16.h" + +#include "cutlass_extensions/vllm_custom_types.cuh" + +namespace cutlass { + +template +struct nameof { + static constexpr char const* value = "unknown"; +}; + +template +inline constexpr auto nameof_v = nameof::value; + +#define NAMEOF_TYPE(T) \ + template <> \ + struct nameof { \ + static constexpr char const* value = #T; \ + }; + +NAMEOF_TYPE(float_e4m3_t) +NAMEOF_TYPE(float_e5m2_t) +NAMEOF_TYPE(half_t) +NAMEOF_TYPE(nv_bfloat16) +NAMEOF_TYPE(bfloat16_t) +NAMEOF_TYPE(float) + +NAMEOF_TYPE(int4b_t) +NAMEOF_TYPE(int8_t) +NAMEOF_TYPE(int32_t) +NAMEOF_TYPE(int64_t) + +NAMEOF_TYPE(vllm_uint4b8_t) +NAMEOF_TYPE(uint4b_t) +NAMEOF_TYPE(uint8_t) +NAMEOF_TYPE(vllm_uint8b128_t) +NAMEOF_TYPE(uint32_t) +NAMEOF_TYPE(uint64_t) + +}; // namespace cutlass \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/dispatch_utils.h b/vllm_v0.10.0/csrc/dispatch_utils.h new file mode 100644 index 0000000..f7b75c4 --- /dev/null +++ b/vllm_v0.10.0/csrc/dispatch_utils.h @@ -0,0 +1,83 @@ +/* + * Adapted from + * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h + */ +#pragma once + +#include + +// Need a special dispatch case macro since we will nest the FP8 dispatch. +// Instead of the usual 'scalar_t', this names the dispatched type 'fp8_t'. +#define AT_DISPATCH_FP8_CASE(enum_type, ...) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, fp8_t, __VA_ARGS__) + +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +// ROCm devices might use either fn or fnuz, so set up dispatch table for both. 
+// A host-based check at runtime will create a preferred FP8 type for ROCm +// such that the correct kernel is dispatched. +#ifdef USE_ROCM + #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \ + AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) + + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \ + AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) + + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#endif + +// When using this dispatch macro, the type is 'fp8_t' not 'scalar_t'. +// See AT_DISPATCH_FP8_CASE above. +#define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, \ + VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) + +#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__) + +#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__)) diff --git a/vllm_v0.10.0/csrc/layernorm_kernels.cu b/vllm_v0.10.0/csrc/layernorm_kernels.cu new file mode 100644 index 0000000..f051eb0 --- /dev/null +++ b/vllm_v0.10.0/csrc/layernorm_kernels.cu @@ -0,0 +1,221 @@ +#include "type_convert.cuh" +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// TODO(woosuk): Further optimize this kernel. 
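+// Reference semantics of the kernel below, written out as a plain host-side
+// sketch (illustration only; names are ad hoc):
+//   for each token t:
+//     ss = sum_i in[t][i]^2
+//     inv_rms = 1 / sqrt(ss / hidden_size + epsilon)
+//     out[t][i] = in[t][i] * inv_rms * weight[i]
+// One thread block handles one token and reduces ss with cub::BlockReduce.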
+template +__global__ void rms_norm_kernel( + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + const float x = (float)input[blockIdx.x * input_stride + idx]; + variance += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * input_stride + idx]; + out[blockIdx.x * hidden_size + idx] = + ((scalar_t)(x * s_variance)) * weight[idx]; + } +} + +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. */ +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +fused_add_rms_norm_kernel( + scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); + + const int vec_hidden_size = hidden_size / width; + const int64_t vec_input_stride = input_stride / width; + __shared__ float s_variance; + float variance = 0.0f; + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = + reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + int64_t strided_id = blockIdx.x * vec_input_stride + idx; + _f16Vec temp = input_v[strided_id]; + temp += residual_v[id]; + variance += temp.sum_squares(); + residual_v[id] = temp; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + int64_t strided_id = blockIdx.x * vec_input_stride + idx; + _f16Vec temp = residual_v[id]; + temp *= s_variance; + temp *= weight_v[idx]; + input_v[strided_id] = temp; + } +} + +/* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. 
+ */ +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +fused_add_rms_norm_kernel( + scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + scalar_t z = input[blockIdx.x * input_stride + idx]; + z += residual[blockIdx.x * hidden_size + idx]; + float x = (float)z; + variance += x * x; + residual[blockIdx.x * hidden_size + idx] = z; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)residual[blockIdx.x * hidden_size + idx]; + input[blockIdx.x * input_stride + idx] = + ((scalar_t)(x * s_variance)) * weight[idx]; + } +} + +} // namespace vllm + +void rms_norm(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + double epsilon) { + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(input.stride(-1) == 1); + TORCH_CHECK(weight.is_contiguous()); + + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + int64_t input_stride = input.stride(-2); + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + vllm::rms_norm_kernel<<>>( + out.data_ptr(), input.data_ptr(), input_stride, + weight.data_ptr(), epsilon, num_tokens, hidden_size); + }); +} + +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + vllm::fused_add_rms_norm_kernel \ + <<>>( \ + input.data_ptr(), input_stride, \ + residual.data_ptr(), weight.data_ptr(), \ + epsilon, num_tokens, hidden_size); \ + }); + +void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] + torch::Tensor& residual, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + double epsilon) { + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); + int hidden_size = input.size(-1); + int64_t input_stride = input.stride(-2); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. 
+ However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); + constexpr int vector_width = 8; + constexpr int req_alignment_bytes = + vector_width * 2; // vector_width * sizeof(bfloat16 or float16) (float32 + // falls back to non-vectorized version anyway) + bool ptrs_are_aligned = inp_ptr % req_alignment_bytes == 0 && + res_ptr % req_alignment_bytes == 0 && + wt_ptr % req_alignment_bytes == 0; + bool offsets_are_multiple_of_vector_width = + hidden_size % vector_width == 0 && input_stride % vector_width == 0; + if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { + LAUNCH_FUSED_ADD_RMS_NORM(0); + } +} diff --git a/vllm_v0.10.0/csrc/layernorm_quant_kernels.cu b/vllm_v0.10.0/csrc/layernorm_quant_kernels.cu new file mode 100644 index 0000000..0fd5849 --- /dev/null +++ b/vllm_v0.10.0/csrc/layernorm_quant_kernels.cu @@ -0,0 +1,253 @@ +/* + * This file contains the CUDA kernels for the fused quantized layernorm. + * The kernels correspond to the kernels in layernorm_kernels.cu, except they + * also produce quantized output directly. + * Currently, only static fp8 quantization is supported. + */ + +#include "type_convert.cuh" +#include "quantization/fp8/common.cuh" +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// TODO(woosuk): Further optimize this kernel. +template +__global__ void rms_norm_static_fp8_quant_kernel( + fp8_type* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + const float x = (float)input[blockIdx.x * input_stride + idx]; + variance += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * input_stride + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. 
*/ +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + fp8_type* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); + + const int vec_hidden_size = hidden_size / width; + const int vec_input_stride = input_stride / width; + __shared__ float s_variance; + float variance = 0.0f; + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = + reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int stride_id = blockIdx.x * vec_input_stride + idx; + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = input_v[stride_id]; + temp += residual_v[id]; + variance += temp.sum_squares(); + residual_v[id] = temp; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = residual_v[id]; + temp *= s_variance; + temp *= weight_v[idx]; +#pragma unroll + for (int i = 0; i < width; ++i) { + out[id * width + i] = + scaled_fp8_conversion(float(temp.data[i]), scale_inv); + } + } +} + +/* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. 
+ */ +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + fp8_type* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + scalar_t z = input[blockIdx.x * input_stride + idx]; + z += residual[blockIdx.x * hidden_size + idx]; + float x = (float)z; + variance += x * x; + residual[blockIdx.x * hidden_size + idx] = z; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)residual[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +} // namespace vllm + +void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + TORCH_CHECK(out.is_contiguous()); + int hidden_size = input.size(-1); + int input_stride = input.stride(-2); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "rms_norm_kernel_scalar_type", [&] { + VLLM_DISPATCH_FP8_TYPES( + out.scalar_type(), "rms_norm_kernel_fp8_type", [&] { + vllm::rms_norm_static_fp8_quant_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + input_stride, weight.data_ptr(), + scale.data_ptr(), epsilon, num_tokens, + hidden_size); + }); + }); +} + +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel_scalar_type", [&] { \ + VLLM_DISPATCH_FP8_TYPES( \ + out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] { \ + vllm::fused_add_rms_norm_static_fp8_quant_kernel \ + <<>>( \ + out.data_ptr(), input.data_ptr(), \ + input_stride, residual.data_ptr(), \ + weight.data_ptr(), scale.data_ptr(), \ + epsilon, num_tokens, hidden_size); \ + }); \ + }); +void fused_add_rms_norm_static_fp8_quant( + torch::Tensor& out, // [..., hidden_size], + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& residual, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(residual.is_contiguous()); + int hidden_size = input.size(-1); + int input_stride = input.stride(-2); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. 
+ When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); + bool ptrs_are_aligned = + inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { + LAUNCH_FUSED_ADD_RMS_NORM(0); + } +} diff --git a/vllm_v0.10.0/csrc/mamba/mamba_ssm/selective_scan.h b/vllm_v0.10.0/csrc/mamba/mamba_ssm/selective_scan.h new file mode 100644 index 0000000..563d2fe --- /dev/null +++ b/vllm_v0.10.0/csrc/mamba/mamba_ssm/selective_scan.h @@ -0,0 +1,266 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ +// clang-format off +// adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan.h + +#pragma once + +#ifndef USE_ROCM + #include +#else + #include +#endif +#include +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SSMParamsBase { + using index_t = uint32_t; + + int batch, dim, seqlen, dstate, n_groups, n_chunks; + int dim_ngroups_ratio; + bool is_variable_B; + bool is_variable_C; + int64_t pad_slot_id; + + bool delta_softplus; + + index_t A_d_stride; + index_t A_dstate_stride; + index_t B_batch_stride; + index_t B_d_stride; + index_t B_dstate_stride; + index_t B_group_stride; + index_t C_batch_stride; + index_t C_d_stride; + index_t C_dstate_stride; + index_t C_group_stride; + index_t u_batch_stride; + index_t u_d_stride; + index_t delta_batch_stride; + index_t delta_d_stride; + index_t z_batch_stride; + index_t z_d_stride; + index_t out_batch_stride; + index_t out_d_stride; + index_t out_z_batch_stride; + index_t out_z_d_stride; + + // Common data pointers. + void *__restrict__ A_ptr; + void *__restrict__ B_ptr; + void *__restrict__ C_ptr; + void *__restrict__ D_ptr; + void *__restrict__ u_ptr; + void *__restrict__ delta_ptr; + void *__restrict__ delta_bias_ptr; + void *__restrict__ out_ptr; + void *__restrict__ ssm_states_ptr; + void *__restrict__ z_ptr; + void *__restrict__ out_z_ptr; + + void *__restrict__ query_start_loc_ptr; + void *__restrict__ cache_indices_ptr; + void *__restrict__ has_initial_state_ptr; + +}; + + + + +#ifndef USE_ROCM + + constexpr size_t custom_max(std::initializer_list ilist) + { + return std::max(ilist); + } + + template + constexpr T constexpr_min(T a, T b) { + return std::min(a, b); + } + +#else + constexpr size_t custom_max(std::initializer_list ilist) + { + return *std::max_element(ilist.begin(), ilist.end()); + } + + template + constexpr T constexpr_min(T a, T b) { + return a < b ? 
a : b; + } +#endif + + +#define MAX_DSTATE 256 + + +inline __device__ float2 operator+(const float2 & a, const float2 & b){ + return {a.x + b.x, a.y + b.y}; +} + +inline __device__ float3 operator+(const float3 &a, const float3 &b) { + return {a.x + b.x, a.y + b.y, a.z + b.z}; +} + +inline __device__ float4 operator+(const float4 & a, const float4 & b){ + return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w}; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template struct BytesToType {}; + +template<> struct BytesToType<16> { + using Type = uint4; + static_assert(sizeof(Type) == 16); +}; + +template<> struct BytesToType<8> { + using Type = uint64_t; + static_assert(sizeof(Type) == 8); +}; + +template<> struct BytesToType<4> { + using Type = uint32_t; + static_assert(sizeof(Type) == 4); +}; + +template<> struct BytesToType<2> { + using Type = uint16_t; + static_assert(sizeof(Type) == 2); +}; + +template<> struct BytesToType<1> { + using Type = uint8_t; + static_assert(sizeof(Type) == 1); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Converter{ + static inline __device__ void to_float(const scalar_t (&src)[N], float (&dst)[N]) { + #pragma unroll + for (int i = 0; i < N; ++i) { dst[i] = src[i]; } + } +}; + +template +struct Converter{ + static inline __device__ void to_float(const at::Half (&src)[N], float (&dst)[N]) { + static_assert(N % 2 == 0); + auto &src2 = reinterpret_cast(src); + auto &dst2 = reinterpret_cast(dst); + #pragma unroll + for (int i = 0; i < N / 2; ++i) { dst2[i] = __half22float2(src2[i]); } + } +}; + +#if __CUDA_ARCH__ >= 800 +template +struct Converter{ + static inline __device__ void to_float(const at::BFloat16 (&src)[N], float (&dst)[N]) { + static_assert(N % 2 == 0); + auto &src2 = reinterpret_cast(src); + auto &dst2 = reinterpret_cast(dst); + #pragma unroll + for (int i = 0; i < N / 2; ++i) { dst2[i] = __bfloat1622float2(src2[i]); } + } +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +template struct SSMScanOp; + +template<> +struct SSMScanOp { + __device__ __forceinline__ float2 operator()(const float2 &ab0, const float2 &ab1) const { + return make_float2(ab1.x * ab0.x, ab1.x * ab0.y + ab1.y); + } +}; + +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +template struct SSMScanPrefixCallbackOp { + using scan_t = std::conditional_t, float2, float4>; + scan_t running_prefix; + // Constructor + __device__ SSMScanPrefixCallbackOp(scan_t running_prefix_) : running_prefix(running_prefix_) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
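+    // For reference: with SSMScanOp above, folding (a0, b0) into (a1, b1)
+    // yields (a1 * a0, a1 * b0 + b1), i.e. the composition of two steps of the
+    // linear recurrence h[t] = a[t] * h[t-1] + b[t]. An inclusive block scan
+    // over the per-element (a, b) pairs therefore yields every h[t], once the
+    // running prefix carried over from earlier chunks is folded in via this
+    // callback.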
+ __device__ scan_t operator()(scan_t block_aggregate) { + scan_t old_prefix = running_prefix; + running_prefix = SSMScanOp()(running_prefix, block_aggregate); + return old_prefix; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void load_input(typename Ktraits::input_t *u, + typename Ktraits::input_t (&u_vals)[Ktraits::kNItems], + typename Ktraits::BlockLoadT::TempStorage &smem_load, + int seqlen) { + if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) { + auto& smem_load_vec = reinterpret_cast(smem_load); + using vec_t = typename Ktraits::vec_t; + typename Ktraits::BlockLoadVecT(smem_load_vec).Load( + reinterpret_cast(u), + reinterpret_cast(u_vals) + #ifdef USE_ROCM + , Ktraits::kNThreads * Ktraits::kNLoads + #endif + + ); + } else { + typename Ktraits::BlockLoadT(smem_load).Load(u, u_vals, seqlen, 0.f); + } +} + + +template +inline __device__ void load_weight(typename Ktraits::input_t *Bvar, + typename Ktraits::weight_t (&B_vals)[Ktraits::kNItems], + typename Ktraits::BlockLoadWeightT::TempStorage &smem_load_weight, + int seqlen) { + constexpr int kNItems = Ktraits::kNItems; + typename Ktraits::input_t B_vals_load[kNItems]; + if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) { + auto& smem_load_weight_vec = reinterpret_cast(smem_load_weight); + using vec_t = typename Ktraits::vec_t; + typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load( + reinterpret_cast(Bvar), + reinterpret_cast(B_vals_load) + ); + } else { + typename Ktraits::BlockLoadWeightT(smem_load_weight).Load(Bvar, B_vals_load, seqlen, 0.f); + } + // #pragma unroll + // for (int i = 0; i < kNItems; ++i) { B_vals[i] = B_vals_load[i]; } + Converter::to_float(B_vals_load, B_vals); +} + +template +inline __device__ void store_output(typename Ktraits::input_t *out, + const float (&out_vals)[Ktraits::kNItems], + typename Ktraits::BlockStoreT::TempStorage &smem_store, + int seqlen) { + typename Ktraits::input_t write_vals[Ktraits::kNItems]; + #pragma unroll + for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; } + if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) { + auto& smem_store_vec = reinterpret_cast(smem_store); + using vec_t = typename Ktraits::vec_t; + typename Ktraits::BlockStoreVecT(smem_store_vec).Store( + reinterpret_cast(out), + reinterpret_cast(write_vals) + ); + } else { + typename Ktraits::BlockStoreT(smem_store).Store(out, write_vals, seqlen); + } +} diff --git a/vllm_v0.10.0/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/vllm_v0.10.0/csrc/mamba/mamba_ssm/selective_scan_fwd.cu new file mode 100644 index 0000000..5766fba --- /dev/null +++ b/vllm_v0.10.0/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -0,0 +1,666 @@ +// clang-format off +// adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan_fwd_kernel.cuh +#include +#include +#include +#include "selective_scan.h" + +#include +#include +#ifdef USE_ROCM + #include // For C10_HIP_CHECK and C10_HIP_KERNEL_LAUNCH_CHECK +#else + #include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK +#endif + +#ifndef USE_ROCM + #include + #include + #include +#else + #include + namespace cub = hipcub; +#endif + +#include "selective_scan.h" +#include "static_switch.h" + +template +struct Selective_Scan_fwd_kernel_traits { + static_assert(kNItems_ % 4 == 0); + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + // Setting MinBlocksPerMP to be 3 
(instead of 2) for 128 threads improves occupancy. + static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3; + static constexpr int kNItems = kNItems_; + static constexpr int kNRows = kNRows_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems); + static_assert(kNItems % kNElts == 0); + static constexpr int kNLoads = kNItems / kNElts; + static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_; + static constexpr bool kIsVariableB = kIsVariableB_; + static constexpr bool kIsVariableC = kIsVariableC_; + static constexpr bool kHasZ = kHasZ_; + static constexpr bool kVarlen = kVarlen_; + + static constexpr bool kDirectIO = kVarlen_ ? false : kIsEvenLen && kNLoads == 1; + static constexpr int kNLoadsIndex = kNItems / 4; + using vec_t = typename BytesToType::Type; + using scan_t = float2; + using BlockLoadT = cub::BlockLoad; + using BlockLoadVecT = cub::BlockLoad; + using BlockLoadWeightT = cub::BlockLoad; + using BlockLoadWeightVecT = cub::BlockLoad; + using BlockStoreT = cub::BlockStore; + using BlockStoreVecT = cub::BlockStore; + // using BlockScanT = cub::BlockScan; + // using BlockScanT = cub::BlockScan; + using BlockScanT = cub::BlockScan; + static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage), + sizeof(typename BlockLoadVecT::TempStorage), + (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage), + (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage), + sizeof(typename BlockStoreT::TempStorage), + sizeof(typename BlockStoreVecT::TempStorage)}); + static constexpr int kSmemSize = kSmemIOSize + sizeof(typename BlockScanT::TempStorage); +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads, Ktraits::kMinBlocks) +void selective_scan_fwd_kernel(SSMParamsBase params) { + constexpr bool kIsVariableB = Ktraits::kIsVariableB; + constexpr bool kIsVariableC = Ktraits::kIsVariableC; + constexpr bool kHasZ = Ktraits::kHasZ; + constexpr bool kVarlen = Ktraits::kVarlen; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr int kNItems = Ktraits::kNItems; + constexpr int kNRows = Ktraits::kNRows; + constexpr bool kDirectIO = Ktraits::kDirectIO; + using input_t = typename Ktraits::input_t; + using weight_t = typename Ktraits::weight_t; + using scan_t = typename Ktraits::scan_t; + + // Shared memory. 
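+    // Layout sketch (as implied by Ktraits): [0, kSmemIOSize) is shared by the
+    // BlockLoad/BlockStore temp storage, [kSmemIOSize, kSmemSize) holds the
+    // BlockScan temp storage, and the per-(row, state) running-prefix scan_t
+    // values live at offset kSmemSize.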
+ extern __shared__ char smem_[]; + // cast to lvalue reference of expected type + // char *smem_loadstorescan = smem_ + 2 * MAX_DSTATE * sizeof(weight_t); + // auto& smem_load = reinterpret_cast(smem_ + 2 * MAX_DSTATE * sizeof(weight_t)); + // auto& smem_load = reinterpret_cast(smem_loadstorescan); + auto& smem_load = reinterpret_cast(smem_); + auto& smem_load_weight = reinterpret_cast(smem_); + auto& smem_load_weight1 = *reinterpret_cast(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage)); + auto& smem_store = reinterpret_cast(smem_); + auto& smem_scan = *reinterpret_cast(smem_ + Ktraits::kSmemIOSize); + // weight_t *smem_a = reinterpret_cast(smem_ + smem_loadstorescan_size); + // weight_t *smem_bc = reinterpret_cast(smem_a + MAX_DSTATE); + scan_t *smem_running_prefix = reinterpret_cast(smem_ + Ktraits::kSmemSize); + + const int batch_id = blockIdx.x; + const int dim_id = blockIdx.y; + const int group_id = dim_id / (params.dim_ngroups_ratio); + int seqlen = params.seqlen; + int sequence_start_index = batch_id; + if constexpr (kVarlen){ + int *query_start_loc = reinterpret_cast(params.query_start_loc_ptr); + sequence_start_index = query_start_loc[batch_id]; + seqlen = query_start_loc[batch_id + 1] - sequence_start_index; + } + const bool has_initial_state = params.has_initial_state_ptr == nullptr ? false + : reinterpret_cast(params.has_initial_state_ptr)[batch_id]; + + const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr + : reinterpret_cast(params.cache_indices_ptr); + const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; + // cache_index == params.pad_slot_id is defined as padding, so we exit early + if (cache_index == params.pad_slot_id){ + return; + } + input_t *u = reinterpret_cast(params.u_ptr) + sequence_start_index * params.u_batch_stride + + dim_id * kNRows * params.u_d_stride; + input_t *delta = reinterpret_cast(params.delta_ptr) + sequence_start_index * params.delta_batch_stride + + dim_id * kNRows * params.delta_d_stride; + weight_t *A = reinterpret_cast(params.A_ptr) + dim_id * kNRows * params.A_d_stride; + weight_t *B = reinterpret_cast(params.B_ptr) + dim_id * kNRows * params.B_d_stride; + input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; + weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; + input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; + input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate; + + float D_val[kNRows] = {0}; + if (params.D_ptr != nullptr) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + D_val[r] = reinterpret_cast(params.D_ptr)[dim_id * kNRows + r]; + } + } + float delta_bias[kNRows] = {0}; + if (params.delta_bias_ptr != nullptr) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + delta_bias[r] = reinterpret_cast(params.delta_bias_ptr)[dim_id * kNRows + r]; + } + } + + + // for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) { + // smem_a[state_idx] = A[state_idx * params.A_dstate_stride]; + // smem_bc[state_idx] = B[state_idx * params.B_dstate_stride] * C[state_idx * params.C_dstate_stride]; + // } + + constexpr int kChunkSize = kNThreads * kNItems; + const int n_chunks = (seqlen + 2048 - 1) / 2048; + for (int chunk = 0; chunk < n_chunks; ++chunk) { + input_t 
u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems]; + + __syncthreads(); + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + if constexpr (!kDirectIO) { + if (r > 0) { __syncthreads(); } + } + load_input(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize); + if constexpr (!kDirectIO) { __syncthreads(); } + load_input(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize); + } + u += kChunkSize; + delta += kChunkSize; + + float delta_vals[kNRows][kNItems], delta_u_vals[kNRows][kNItems], out_vals[kNRows][kNItems]; + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + float u_val = float(u_vals[r][i]); + delta_vals[r][i] = float(delta_vals_load[r][i]) + delta_bias[r]; + if (params.delta_softplus) { + delta_vals[r][i] = delta_vals[r][i] <= 20.f ? log1pf(expf(delta_vals[r][i])) : delta_vals[r][i]; + } + delta_u_vals[r][i] = delta_vals[r][i] * u_val; + out_vals[r][i] = D_val[r] * u_val; + } + } + + __syncthreads(); + for (int state_idx = 0; state_idx < params.dstate; ++state_idx) { + weight_t A_val[kNRows]; + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + A_val[r] = A[state_idx * params.A_dstate_stride + r * params.A_d_stride]; + // Multiply the real part of A with LOG2E so we can use exp2f instead of expf. + constexpr float kLog2e = M_LOG2E; + A_val[r] *= kLog2e; + } + // This variable holds B * C if both B and C are constant across seqlen. If only B varies + // across seqlen, this holds C. If only C varies across seqlen, this holds B. + // If both B and C vary, this is unused. + weight_t BC_val[kNRows]; + weight_t B_vals[kNItems], C_vals[kNItems]; + if constexpr (kIsVariableB) { + load_weight(Bvar + state_idx * params.B_dstate_stride, B_vals, + smem_load_weight, (seqlen - chunk * kChunkSize) * (1)); + if constexpr (!kIsVariableC) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + BC_val[r] = C[state_idx * params.C_dstate_stride + r * params.C_d_stride]; + } + } + } + if constexpr (kIsVariableC) { + auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1; + load_weight(Cvar + state_idx * params.C_dstate_stride, C_vals, + smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 )); + if constexpr (!kIsVariableB) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride]; + } + } + } + if constexpr (!kIsVariableB && !kIsVariableC) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride] * C[state_idx * params.C_dstate_stride + r * params.C_d_stride]; + } + } + + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + if (r > 0) { __syncthreads(); } // Scan could be using the same smem + scan_t thread_data[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]), + !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]); + + if (seqlen % (kNItems * kNThreads) != 0) { // So that the last state is correct + if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) { + thread_data[i] = make_float2(1.f, 0.f); + } + } + } + // Initialize running total + + scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? 
float(ssm_states[state_idx]): 0.0); + + SSMScanPrefixCallbackOp prefix_op(running_prefix); + typename Ktraits::BlockScanT(smem_scan).InclusiveScan( + thread_data, thread_data, SSMScanOp(), prefix_op + ); + // There's a syncthreads in the scan op, so we don't need to sync here. + // Unless there's only 1 warp, but then it's the same thread (0) reading and writing. + if (threadIdx.x == 0) { + smem_running_prefix[state_idx] = prefix_op.running_prefix; + if (chunk == n_chunks - 1) { + ssm_states[state_idx] = input_t(prefix_op.running_prefix.y); + } + } + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + const weight_t C_val = !kIsVariableC + ? BC_val[r] + : (!kIsVariableB ? BC_val[r] * C_vals[i] : C_vals[i]); + out_vals[r][i] += thread_data[i].y * C_val; + } + } + } + + input_t *out = reinterpret_cast(params.out_ptr) + sequence_start_index * params.out_batch_stride + + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize; + __syncthreads(); + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + if constexpr (!kDirectIO) { + if (r > 0) { __syncthreads(); } + } + store_output(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize); + } + + if constexpr (kHasZ) { + input_t *z = reinterpret_cast(params.z_ptr) + sequence_start_index * params.z_batch_stride + + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize; + input_t *out_z = reinterpret_cast(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride + + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize; + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + input_t z_vals[kNItems]; + __syncthreads(); + load_input(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize); + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + float z_val = z_vals[i]; + out_vals[r][i] *= z_val / (1 + expf(-z_val)); + } + __syncthreads(); + store_output(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize); + } + } + + Bvar += kChunkSize * 1; + Cvar += kChunkSize * 1; + } +} + +template +void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { + // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block + // processing 1 row. 
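+  // For example, the seqlen-based dispatch in selective_scan_fwd_cuda below
+  // maps seqlen = 4096 on the CUDA path to kNThreads = 128, kNItems = 16, so
+  // each block consumes kChunkSize = 128 * 16 = 2048 elements per chunk and
+  // loops over ceil(4096 / 2048) = 2 chunks. The grid is
+  // (batch, dim / kNRows) = (batch, dim), i.e. one block per (sequence,
+  // channel) pair, and the dynamic shared memory request is
+  // Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(scan_t) bytes.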
+ constexpr int kNRows = 1; + // kIsVariableB, kIsVariableC and kHasZ are all set to True to reduce binary size + constexpr bool kIsVariableB = true; + constexpr bool kIsVariableC = true; + BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { + BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] { + BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] { + using Ktraits = Selective_Scan_fwd_kernel_traits; + constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t); + dim3 grid(params.batch, params.dim / kNRows); + auto kernel = &selective_scan_fwd_kernel; + if (kSmemSize >= 48 * 1024) { +#ifdef USE_ROCM + C10_HIP_CHECK(hipFuncSetAttribute( + reinterpret_cast(kernel), hipFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); +#else + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); +#endif + } + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + }); + }); +} + +template +void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { + + #ifndef USE_ROCM + if (params.seqlen <= 128) { + selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 256) { + selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 512) { + selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 1024) { + selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + } else { + selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + } + #else + if (params.seqlen <= 256) { + selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 512) { + selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 1024) { + selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + } else { + selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + } + #endif +} + +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); + +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) 
\ + if (ITYPE == at::ScalarType::Half) { \ + using input_t = at::Half; \ + using weight_t = float; \ + __VA_ARGS__(); \ + } else if (ITYPE == at::ScalarType::BFloat16) { \ + using input_t = at::BFloat16; \ + using weight_t = float; \ + __VA_ARGS__(); \ + } else if (ITYPE == at::ScalarType::Float) { \ + using input_t = float; \ + using weight_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ + } + + +template +void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); + +void set_ssm_params_fwd(SSMParamsBase ¶ms, + // sizes + const size_t batch, + const size_t dim, + const size_t seqlen, + const size_t dstate, + const size_t n_groups, + const bool is_variable_B, + const bool is_variable_C, + // device pointers + const torch::Tensor u, + const torch::Tensor delta, + const torch::Tensor A, + const torch::Tensor B, + const torch::Tensor C, + const torch::Tensor out, + const torch::Tensor z, + const torch::Tensor out_z, + const std::optional& D, + const std::optional& delta_bias, + const torch::Tensor ssm_states, + bool has_z, + bool delta_softplus, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, + bool varlen, + int64_t pad_slot_id) { + + // Reset the parameters + memset(¶ms, 0, sizeof(params)); + + params.batch = batch; + params.dim = dim; + params.seqlen = seqlen; + params.dstate = dstate; + params.n_groups = n_groups; + params.dim_ngroups_ratio = dim / n_groups; + params.pad_slot_id = pad_slot_id; + + params.delta_softplus = delta_softplus; + + params.is_variable_B = is_variable_B; + params.is_variable_C = is_variable_C; + + // Set the pointers and strides. + params.u_ptr = u.data_ptr(); + params.delta_ptr = delta.data_ptr(); + params.A_ptr = A.data_ptr(); + params.B_ptr = B.data_ptr(); + params.C_ptr = C.data_ptr(); + params.D_ptr = D.has_value() ? D.value().data_ptr() : nullptr; + params.delta_bias_ptr = delta_bias.has_value() ? delta_bias.value().data_ptr() : nullptr; + params.out_ptr = out.data_ptr(); + params.ssm_states_ptr = ssm_states.data_ptr(); + params.z_ptr = has_z ? z.data_ptr() : nullptr; + params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr; + params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr; + params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr; + params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr; + + + // All stride are in elements, not bytes. 
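+  // For reference, the layouts behind the stride assignments below are
+  // (with variable B/C, which the entry point enforces):
+  //   varlen:  u / delta: (dim, total_tokens),  B / C: (n_groups, dstate, total_tokens)
+  //   batched: u / delta: (batch, dim, seqlen), B / C: (batch, n_groups, dstate, seqlen)
+  // e.g. in the varlen case u_batch_stride is the per-token stride u.stride(1)
+  // (the kernel offsets by sequence_start_index tokens) while u_d_stride is
+  // the per-channel stride u.stride(0).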
+ params.A_d_stride = A.stride(0); + params.A_dstate_stride = A.stride(1); + + if (varlen){ + params.B_batch_stride = B.stride(2); + params.B_group_stride = B.stride(0); + params.B_dstate_stride = B.stride(1); + params.C_batch_stride = C.stride(2); + params.C_group_stride = C.stride(0); + params.C_dstate_stride = C.stride(1); + + params.u_batch_stride = u.stride(1); + params.u_d_stride = u.stride(0); + params.delta_batch_stride = delta.stride(1); + params.delta_d_stride = delta.stride(0); + if (has_z) { + params.z_batch_stride = z.stride(1); + params.z_d_stride = z.stride(0); + params.out_z_batch_stride = out_z.stride(1); + params.out_z_d_stride = out_z.stride(0); + } + params.out_batch_stride = out.stride(1); + params.out_d_stride = out.stride(0); + + } + else{ + if (!is_variable_B) { + params.B_d_stride = B.stride(0); + } else { + params.B_batch_stride = B.stride(0); + params.B_group_stride = B.stride(1); + } + params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2); + if (!is_variable_C) { + params.C_d_stride = C.stride(0); + } else { + params.C_batch_stride = C.stride(0); + params.C_group_stride = C.stride(1); + } + params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2); + params.u_batch_stride = u.stride(0); + params.u_d_stride = u.stride(1); + params.delta_batch_stride = delta.stride(0); + params.delta_d_stride = delta.stride(1); + if (has_z) { + params.z_batch_stride = z.stride(0); + params.z_d_stride = z.stride(1); + params.out_z_batch_stride = out_z.stride(0); + params.out_z_d_stride = out_z.stride(1); + } + params.out_batch_stride = out.stride(0); + params.out_d_stride = out.stride(1); + } +} + +void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, + const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, + const std::optional &D_, + const std::optional &z_, + const std::optional &delta_bias_, + bool delta_softplus, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, + const torch::Tensor &ssm_states, + // used to identify padding entries if cache_indices provided + // in case of padding, the kernel will return early + int64_t pad_slot_id) { + auto input_type = u.scalar_type(); + auto weight_type = A.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + TORCH_CHECK(weight_type == at::ScalarType::Float); + + const bool is_variable_B = B.dim() >= 3; + const bool is_variable_C = C.dim() >= 3; + + TORCH_CHECK(delta.scalar_type() == input_type); + TORCH_CHECK(B.scalar_type() == (!is_variable_B ? weight_type : input_type)); + TORCH_CHECK(C.scalar_type() == (!is_variable_C ? weight_type : input_type)); + + TORCH_CHECK(u.is_cuda()); + TORCH_CHECK(delta.is_cuda()); + TORCH_CHECK(A.is_cuda()); + TORCH_CHECK(B.is_cuda()); + TORCH_CHECK(C.is_cuda()); + + TORCH_CHECK(u.stride(-1) == 1 || u.size(-1) == 1); + TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1); + + const auto sizes = u.sizes(); + const bool varlen = query_start_loc.has_value(); + const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0]; + const int dim = varlen ? sizes[0] : sizes[1]; + const int seqlen = varlen ? sizes[1] : sizes[2]; + const int dstate = A.size(1); + const int n_groups = varlen ? 
B.size(0) : B.size(1); + + TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256"); + + if (varlen) { + CHECK_SHAPE(u, dim, seqlen); + CHECK_SHAPE(delta, dim, seqlen); + } else { + CHECK_SHAPE(u, batch_size, dim, seqlen); + CHECK_SHAPE(delta, batch_size, dim, seqlen); + } + CHECK_SHAPE(A, dim, dstate); + TORCH_CHECK(is_variable_B, "is_variable_B = False is disabled in favor of reduced binary size") + if (varlen) { + CHECK_SHAPE(B, n_groups, dstate, seqlen); + } else { + CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen); + } + TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1); + + TORCH_CHECK(is_variable_C, "is_variable_C = False is disabled in favor of reduced binary size") + if (varlen) { + CHECK_SHAPE(C, n_groups, dstate, seqlen); + } else { + CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen); + } + TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1); + + if (D_.has_value()) { + auto D = D_.value(); + TORCH_CHECK(D.scalar_type() == at::ScalarType::Float); + TORCH_CHECK(D.is_cuda()); + TORCH_CHECK(D.stride(-1) == 1 || D.size(-1) == 1); + CHECK_SHAPE(D, dim); + } + + if (delta_bias_.has_value()) { + auto delta_bias = delta_bias_.value(); + TORCH_CHECK(delta_bias.scalar_type() == at::ScalarType::Float); + TORCH_CHECK(delta_bias.is_cuda()); + TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1); + CHECK_SHAPE(delta_bias, dim); + } + + + if (has_initial_state.has_value()) { + auto has_initial_state_ = has_initial_state.value(); + TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool); + TORCH_CHECK(has_initial_state_.is_cuda()); + CHECK_SHAPE(has_initial_state_, batch_size); + } + + + if (query_start_loc.has_value()) { + auto query_start_loc_ = query_start_loc.value(); + TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int); + TORCH_CHECK(query_start_loc_.is_cuda()); + } + + + if (cache_indices.has_value()) { + auto cache_indices_ = cache_indices.value(); + TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int); + TORCH_CHECK(cache_indices_.is_cuda()); + CHECK_SHAPE(cache_indices_, batch_size); + } + + + at::Tensor z, out_z; + const bool has_z = z_.has_value(); + if (has_z) { + z = z_.value(); + TORCH_CHECK(z.scalar_type() == input_type); + TORCH_CHECK(z.is_cuda()); + TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1); + if (varlen){ + CHECK_SHAPE(z, dim, seqlen); + } else { + CHECK_SHAPE(z, batch_size, dim, seqlen); + } + + out_z = z; + } + + // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout + at::Tensor out = delta; + TORCH_CHECK(ssm_states.scalar_type() == input_type); + TORCH_CHECK(ssm_states.is_cuda()); + TORCH_CHECK(ssm_states.stride(-1) == 1); + + SSMParamsBase params; + set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, is_variable_B, is_variable_C, + u, delta, A, B, C, out, z, out_z, + D_, + delta_bias_, + ssm_states, + has_z, + delta_softplus, + query_start_loc, + cache_indices, + has_initial_state, + varlen, + pad_slot_id + ); + + + const at::cuda::OptionalCUDAGuard device_guard(device_of(u)); + auto stream = at::cuda::getCurrentCUDAStream().stream(); + DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { + selective_scan_fwd_cuda(params, stream); + }); +} diff --git a/vllm_v0.10.0/csrc/mamba/mamba_ssm/static_switch.h b/vllm_v0.10.0/csrc/mamba/mamba_ssm/static_switch.h new file mode 100644 index 0000000..840cb23 --- /dev/null +++ b/vllm_v0.10.0/csrc/mamba/mamba_ssm/static_switch.h @@ -0,0 +1,28 @@ +// 
Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +// clang-format off +// adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/.gitignore b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/.gitignore new file mode 100644 index 0000000..7708855 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/.gitignore @@ -0,0 +1 @@ +kernel_*.cu \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/generate_kernels.py b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/generate_kernels.py new file mode 100644 index 0000000..49f3371 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -0,0 +1,117 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import glob +import itertools +import os +import subprocess + +import jinja2 + +FILE_HEAD = """ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { +""".strip() + +TEMPLATE = ("template __global__ void Marlin<" + "{{scalar_t}}, " + "{{w_type_id}}, " + "{{threads}}, " + "{{thread_m_blocks}}, " + "{{thread_n_blocks}}, " + "{{thread_k_blocks}}, " + "{{'true' if m_block_size_8 else 'false'}}, " + "{{stages}}, " + "{{group_blocks}}, " + "{{'true' if is_zp_float else 'false'}}>" + "( MARLIN_KERNEL_PARAMS );") + +# int8 with zero point case (vllm::kU8) is also supported, +# we don't add it to reduce wheel size. 
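+# As a concrete example of the generator output: for dtype == "fp16",
+# scalar_type == "vllm::kU4B8", thread config (128, 128, 256), m_blocks == 1
+# and group_blocks == -1 (channelwise), the rendered line is
+#   template __global__ void Marlin<half, vllm::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+# and it lands in kernel_fp16_ku4b8.cu together with every other valid
+# (group_blocks, m_blocks, thread_config) combination for that type and dtype.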
+SCALAR_TYPES = [ + "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn", + "vllm::kFE2M1f" +] +THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] + +THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] +# group_blocks: +# = 0 : act order case +# = -1 : channelwise quantization +# > 0 : group_size=16*group_blocks +GROUP_BLOCKS = [0, -1, 1, 2, 4, 8] +DTYPES = ["fp16", "bf16"] + + +def remove_old_kernels(): + for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): + subprocess.call(["rm", "-f", filename]) + + +def generate_new_kernels(): + for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): + all_template_str_list = [] + + for group_blocks, m_blocks, thread_configs in itertools.product( + GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS): + + # act order case only support gptq-int4 and gptq-int8 + if group_blocks == 0 and scalar_type not in [ + "vllm::kU4B8", "vllm::kU8B128" + ]: + continue + if thread_configs[2] == 256: + # for small batch (m_blocks == 1), we only need (128, 128, 256) + # for large batch (m_blocks > 1), we only need (64, 256, 256) + if m_blocks <= 1 and thread_configs[0] != 128: + continue + if m_blocks > 1 and thread_configs[0] != 64: + continue + + # we only support channelwise quantization and group_size == 128 + # for fp8 + if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]: + continue + # nvfp4 only supports group_size == 16 + if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]: + continue + # other quantization methods don't support group_size = 16 + if scalar_type != "vllm::kFE2M1f" and group_blocks == 1: + continue + + k_blocks = thread_configs[0] // 16 + n_blocks = thread_configs[1] // 16 + threads = thread_configs[2] + + c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" + + template_str = jinja2.Template(TEMPLATE).render( + scalar_t=c_dtype, + w_type_id=scalar_type + ".id()", + threads=threads, + thread_m_blocks=max(m_blocks, 1), + thread_n_blocks=n_blocks, + thread_k_blocks=k_blocks, + m_block_size_8=m_blocks == 0.5, + stages="pipe_stages", + group_blocks=group_blocks, + is_zp_float=False, + ) + + all_template_str_list.append(template_str) + + file_content = FILE_HEAD + "\n\n" + file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" + filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" + + with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: + f.write(file_content) + + +if __name__ == "__main__": + remove_old_kernels() + generate_new_kernels() diff --git a/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/kernel.h b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/kernel.h new file mode 100644 index 0000000..537282a --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/kernel.h @@ -0,0 +1,43 @@ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "quantization/gptq_marlin/marlin.cuh" +#include "quantization/gptq_marlin/marlin_dtypes.cuh" +#include "core/scalar_type.hpp" + +#define MARLIN_KERNEL_PARAMS \ + const int4 *__restrict__ A, const int4 *__restrict__ B, \ + int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ scales_ptr, \ + const uint16_t *__restrict__ scale2_ptr, \ + const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ + const int32_t *__restrict__ sorted_token_ids_ptr, \ + const int32_t *__restrict__ expert_ids_ptr, \ + const int32_t *__restrict__ num_tokens_past_padded_ptr, \ + const float *__restrict__ topk_weights_ptr, int top_k, \ + bool mul_topk_weights, bool is_ep, int 
num_groups, int prob_m, \ + int prob_n, int prob_k, int *locks, bool use_atomic_add, \ + bool use_fp32_reduce, int max_shared_mem + +namespace MARLIN_NAMESPACE_NAME { +template shared + // fetch pipeline + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? + > +__global__ void Marlin(MARLIN_KERNEL_PARAMS); + +} diff --git a/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/marlin_template.h b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/marlin_template.h new file mode 100644 index 0000000..8a913bb --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -0,0 +1,1925 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "quantization/gptq_marlin/marlin.cuh" +#include "quantization/gptq_marlin/marlin_dtypes.cuh" +#include "quantization/gptq_marlin/dequant.h" +#include "core/scalar_type.hpp" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace MARLIN_NAMESPACE_NAME { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template shared + // fetch pipeline + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + const int32_t* __restrict__ sorted_token_ids_ptr, // moe sorted_ids + const int32_t* __restrict__ expert_ids_ptr, // moe expert ids + const int32_t* __restrict__ num_tokens_past_padded_ptr, // moe num tokens + const float* __restrict__ topk_weights_ptr, // moe top weights + int top_k, // num of experts per token + bool mul_topk_weights, // mul topk weights or not + bool is_ep, // expert parallelism + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce, // whether to use fp32 global reduce + int max_shared_mem) {} + +} // namespace MARLIN_NAMESPACE_NAME + +#else + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +template +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +template +__device__ inline void mma_trans( + const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + const typename ScalarType::FragB& frag_b2, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + const uint32_t* b2 = reinterpret_cast(&frag_b2); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : 
"r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +template +__device__ inline void ldsm(typename ScalarType::FragA& frag_a, + const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + if constexpr (count == 4) { + asm volatile( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); + } else if constexpr (count == 2) { + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" + : "=r"(a[0]), "=r"(a[1]) + : "r"(smem)); + } else if constexpr (count == 1) { + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" + : "=r"(a[0]) + : "r"(smem)); + } else { + static_assert(count == 1 || count == 2 || count == 4, "invalid count"); + } +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +template +__device__ inline void scale(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s = + ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +template +__device__ inline void scale_and_sub( + typename ScalarType::FragB& frag_b, scalar_t s, scalar_t zp) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s2 = ScalarType::num2num2(s); + scalar_t2 zp2 = ScalarType::num2num2(zp); + frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2)); + frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2)); +} + +template +__device__ inline void sub_zp(typename ScalarType::FragB& frag_b, + typename ScalarType::scalar_t2& frag_zp, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 zp = + ScalarType::num2num2(reinterpret_cast(&frag_zp)[i]); + frag_b[0] = __hsub2(frag_b[0], zp); + frag_b[1] = __hsub2(frag_b[1], zp); +} + +// Same as above, but for act_order (each K is multiplied individually) +template +__device__ inline void scale4(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s_1, + typename ScalarType::FragS& frag_s_2, + typename ScalarType::FragS& frag_s_3, + typename ScalarType::FragS& frag_s_4, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s_val_1_2; + s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; + + scalar_t2 s_val_3_4; + s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; + + frag_b[0] = __hmul2(frag_b[0], s_val_1_2); + frag_b[1] = __hmul2(frag_b[1], s_val_3_4); +} + +// Given 2 floats multiply by 2 scales (halves) +template +__device__ inline void scale_float(float* c, + typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); + c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], ScalarType::num2float(s_ptr[1])); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. 
+ asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +// Wait until value of lock to be negative, and then add 1 +__device__ inline void wait_negative_and_add(int* lock) { + if (threadIdx.x == 0) { + int state = 0; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state >= 0); + atomicAdd(lock, 1); + } + __syncthreads(); +} + +template shared + // fetch pipeline + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const uint16_t* __restrict__ scale2_ptr, // fp16 global scale (for nvfp4 + // only) + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + const int32_t* __restrict__ sorted_token_ids_ptr, // moe sorted_ids + const int32_t* __restrict__ expert_ids_ptr, // moe expert ids + const int32_t* __restrict__ num_tokens_past_padded_ptr, // moe num tokens + const float* __restrict__ topk_weights_ptr, // moe top weights + int top_k, // num of experts per token + bool mul_topk_weights, // mul topk weights or not + bool is_ep, // expert parallelism + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce, // whether to use fp32 global reduce + int max_shared_mem) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. 
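+  // To tie the 3x3 / 5 SM picture above to the arithmetic below: with
+  // k_tiles = 3, n_tiles = 3, parallel = 1 and gridDim.x = 5 we get
+  // iters = div_ceil(3 * 3 * 1, 5) = 2, so blockIdx.x = 1 starts at tile
+  // index iters * 1 = 2, i.e. slice_row = 2 % 3 = 2 and slice_col_par = 0;
+  // its two iterations cover the last k-tile of column 0 and the first
+  // k-tile of column 1, which are exactly the two "1" entries above.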
+ using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + using FragA = typename ScalarType::FragA; + using FragB = typename ScalarType::FragB; + using FragC = typename ScalarType::FragC; + using FragS = typename ScalarType::FragS; + using FragZP = typename ScalarType::FragZP; + + extern __shared__ int4 sh[]; + static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id); + constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8; + constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 || + w_type == vllm::kU4B8 || w_type == vllm::kU8B128; + // see comments of dequant.h for more details + constexpr bool dequant_skip_flop = + !is_int_type || + has_zp && !is_zp_float && !std::is_same::value || + has_zp && !is_zp_float && !(w_type == vllm::kU8); + + scalar_t2 global_scale; + + constexpr bool has_act_order = group_blocks == 0; + + constexpr int pack_factor = 32 / w_type.size_bits(); + static_assert(thread_m_blocks == 1 || !m_block_size_8); + constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks); + const int group_size = + (!has_act_order && group_blocks == -1) ? prob_k : prob_k / num_groups; + const int scales_expert_stride = + prob_n * prob_k / group_size / (w_type == vllm::kFE2M1f ? 16 : 8); + const int zp_expert_stride = + is_zp_float ? prob_n * prob_k / group_size / 8 + : prob_n * prob_k / group_size / (pack_factor * 4); + + // parallel: num valid moe blocks + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; + int parallel = num_tokens_past_padded / moe_block_size; + int num_valid_blocks = parallel; + if (is_ep) { + for (int i = 0; i < parallel; i++) { + if (expert_ids_ptr[i] == -1) num_valid_blocks--; + } + } + int num_invalid_blocks = parallel - num_valid_blocks; + parallel = num_valid_blocks; + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. 
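+      // For example, with group_blocks = 8 (group_size = 128) and
+      // thread_k_blocks = 4 the ratio is 2, so iters = 5 is rounded up to 6
+      // and every stripe starts exactly on a group boundary.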
+ iters = (group_blocks / thread_k_blocks) * + div_ceil(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + int par_id = 0; + int block_id = -1; + int64_t expert_id = 0; // use int64 to avoid computation result overflow + int old_expert_id = 0; + int64_t B_expert_off = 0; + + int4* sh_block_sorted_ids_int4 = sh; + int4* sh_rd_block_sorted_ids_int4 = + sh_block_sorted_ids_int4 + moe_block_size / 4; + int4* sh_block_topk_weights_int4 = + sh_rd_block_sorted_ids_int4 + moe_block_size / 4; + // sh_block_topk_weights_int4 only need (moe_block_size / 4); + // but we pad to align to 256 bytes + int4* sh_new = + sh_block_topk_weights_int4 + moe_block_size / 2 + moe_block_size; + int32_t* sh_block_sorted_ids = + reinterpret_cast(sh_block_sorted_ids_int4); + int32_t* sh_rd_block_sorted_ids = + reinterpret_cast(sh_rd_block_sorted_ids_int4); + scalar_t2* sh_block_topk_weights = + reinterpret_cast(sh_block_topk_weights_int4); + + int32_t block_num_valid_tokens = 0; + int32_t locks_off = 0; + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; + } + if (parallel * n_tiles >= gridDim.x) { + // when parallel * n_tiles >= sms + // then there are at most $sms$ conflict tile blocks + locks_off = blockIdx.x; + } else { + locks_off = (iters * blockIdx.x) / k_tiles - 1; + } + + // read moe block data given block_id + // block_sorted_ids / block_num_valid_tokens / block_topk_weights + auto read_moe_block_data = [&](int block_id) { + block_num_valid_tokens = moe_block_size; + #pragma unroll + for (int i = 0; i < moe_block_size / 4; i++) { + int4 sorted_token_ids_int4 = reinterpret_cast( + sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i]; + int* sorted_token_ids = reinterpret_cast(&sorted_token_ids_int4); + #pragma unroll + for (int j = 0; j < 4; j++) { + if (sorted_token_ids[j] >= prob_m * top_k) { + block_num_valid_tokens = i * 4 + j; + break; + } + } + if (block_num_valid_tokens != moe_block_size) break; + } + + __syncthreads(); + int tid4 = threadIdx.x / 4; + if (threadIdx.x % 4 == 0 && threadIdx.x < block_num_valid_tokens) { + sh_block_sorted_ids_int4[tid4] = reinterpret_cast( + sorted_token_ids_ptr)[block_id * moe_block_size / 4 + tid4]; + + #pragma unroll + for (int i = 0; i < 4; i++) + sh_rd_block_sorted_ids[tid4 * 4 + i] = + sh_block_sorted_ids[tid4 * 4 + i] / top_k; + + if (mul_topk_weights) { + #pragma unroll + for (int i = 0; i < 4; i++) { + int idx = tid4 * 4 + i; + idx = idx < block_num_valid_tokens ? 
idx : 0; + if constexpr (w_type == vllm::kFE2M1f) { + sh_block_topk_weights[idx] = __hmul2( + global_scale, Dtype::num2num2(Dtype::float2num( + topk_weights_ptr[sh_block_sorted_ids[idx]]))); + } else { + sh_block_topk_weights[idx] = Dtype::num2num2( + Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]])); + } + } + } + } + __syncthreads(); + }; + + // when move to next moe block, find the next block_id and expert_id + // and then read moe block data + auto update_next_moe_block_data = [&]() { + if (par_id >= parallel) return; + + old_expert_id = expert_id; + if (num_invalid_blocks > 0) { + int skip_count = block_id == -1 ? par_id : 0; + block_id++; + for (int i = block_id; i < num_tokens_past_padded / moe_block_size; i++) { + expert_id = expert_ids_ptr[i]; + if (expert_id != -1) { + if (skip_count == 0) { + block_id = i; + break; + }; + skip_count--; + }; + } + } else { + block_id = par_id; + expert_id = expert_ids_ptr[block_id]; + } + + if constexpr (w_type == vllm::kFE2M1f) { + uint16_t val = scale2_ptr[expert_id]; + global_scale = Dtype::num2num2(*reinterpret_cast(&val)); + } + + B_expert_off = expert_id * prob_n * prob_k / (pack_factor * 4); + scales_ptr += (expert_id - old_expert_id) * scales_expert_stride; + if constexpr (has_zp) { + zp_ptr += (expert_id - old_expert_id) * zp_expert_stride; + } + if constexpr (has_act_order) { + g_idx += (expert_id - old_expert_id) * prob_k; + } + + read_moe_block_data(block_id); + }; + + // Compute all information about the current slice which is required for + // synchronization. + auto init_slice = [&](bool first_init = false) { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = div_ceil(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (parallel * n_tiles >= gridDim.x) { + if (slice_count > 1 && slice_idx == slice_count - 1) { + locks_off++; + } + } else { + locks_off++; + } + + if (first_init && use_atomic_add && slice_count > 1 && slice_idx == 0) { + constexpr int threads_per_m = 16 * thread_n_blocks / 8; + int m_per_thread = + div_ceil(block_num_valid_tokens, threads / threads_per_m); + for (int i = 0; i < m_per_thread; i++) { + int row = threads / threads_per_m * i + threadIdx.x / threads_per_m; + if (row < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[row]; + int col = slice_col * 16 * thread_n_blocks / 8 + + threadIdx.x % threads_per_m; + C[sorted_row * prob_n / 8 + col] = {0, 0, 0, 0}; + } + } + // After write zero to output, write a negative value to lock. + // Every SM that processes the same slice would wait for + // the negative value, and then atomicAdd 1 to it. + // After all SMs are processed, the lock value would back to 0 again. 
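+      // For example, with slice_count = 3 thread 0 below stores
+      // 1 - slice_count = -2; the two other blocks sharing this slice each
+      // wait for a negative value and atomicAdd 1 (-2 -> -1 -> 0), so the
+      // lock reads 0 again once the slice is fully processed.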
+ __syncthreads(); + if (threadIdx.x == 0) locks[locks_off] = 1 - slice_count; + } + + if (slice_col == n_tiles) { + slice_col = 0; + par_id++; + update_next_moe_block_data(); + } + }; + + update_next_moe_block_data(); + init_slice(true); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / (pack_factor * 4); + constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; + constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2; + constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; + + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); + constexpr int b_sh_wr_delta = threads * b_thread_vecs; + constexpr int b_sh_rd_delta = threads * b_thread_vecs; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = + !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks / (w_type == vllm::kFE2M1f ? 2 : 1) + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + constexpr int act_s_max_num_groups = 32; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + // Zero-points sizes/strides + int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = is_zp_float + ? 16 * thread_n_blocks / 8 + : ((16 * thread_n_blocks) / pack_factor) / 4; + constexpr int zp_tb_groups = s_tb_groups; + constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; + int zp_gl_rd_delta = zp_gl_stride; + + // Global A read index of current thread. + int a_gl_rd_row = threadIdx.x / a_gl_rd_delta_o; + int a_gl_rd_col = a_gl_rd_delta_o * slice_row + threadIdx.x % a_gl_rd_delta_o; + + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 
2 : 1))) + + (threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1)); + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + auto b_sh_wr = threadIdx.x * b_thread_vecs; + auto b_sh_rd = threadIdx.x * b_thread_vecs; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (!has_act_order) { + if constexpr (group_blocks == -1) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) / + (w_type == vllm::kFE2M1f ? 2 : 1) + + s_sh_stride * slice_col + threadIdx.x; + } + } + auto s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // Zero-points + int zp_gl_rd; + if constexpr (has_zp) { + if constexpr (group_blocks == -1) { + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } else { + zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + zp_sh_stride * slice_col + threadIdx.x; + } + } + auto zp_sh_wr = threadIdx.x; + bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + int s_sh_rd; + if constexpr (group_blocks != -1 && w_type == vllm::kFE2M1f) { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + int warp_row = warp_id / n_warps; + + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + s_sh_rd = s_sh_rd * 2 + warp_row % 2; + + } else if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else if constexpr (group_blocks == -1 && + (m_block_size_8 || (has_zp && !dequant_skip_flop))) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 8; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Zero-points have the same read layout as the scales + // (without column-wise case) + constexpr int num_col_threads = 8; + constexpr int num_row_threads = 4; + constexpr int num_ints_per_thread = 8 / pack_factor; + int zp_sh_rd; + if constexpr (has_zp) { + if constexpr (is_zp_float) { + if constexpr (group_blocks != -1) { + zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + } + } else { + zp_sh_rd = num_ints_per_thread * num_col_threads * + ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } + } + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? 
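+  // For example, with thread_k_blocks = 8 the tile row stride
+  // a_gl_rd_delta_o is 16 int4 elements. Since `+` binds tighter than `^`,
+  // transform_a computes (row * 16 + col) ^ (row % 8), which (as row % 8 < 8
+  // and the row base row * 16 has zero low bits) equals
+  // row * 16 + (col ^ (row % 8)): the int4 at row 3, col 5 (linear index 53)
+  // is stored at 53 ^ 3 = 54, i.e. col 6 of the same row. Rows 0..7 thus
+  // permute only the low 3 column bits, each differently, which is what
+  // avoids the bank conflicts described above.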
+ auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ (row % 8); + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + // Shared memory storage for global fetch pipelines. + constexpr int sh_red_size = (2 * thread_n_blocks + 1) * 16 * thread_m_blocks; + constexpr int sh_b_size = stages * b_sh_stage; + int4* sh_b = sh_new; + int4* sh_red = sh_new; + int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + int4* sh_zp = sh_g_idx + (stages * g_idx_stage); + constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride) + : (stages * s_sh_stage); + int4* sh_s = sh_zp + (stages * zp_sh_stage); + // shared memory reused by reduction should be smaller than + // shared memory used by weight. + static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= + stages * b_sh_stage); + int4* sh_a = sh_s + sh_s_size; + constexpr int shm_size_used = + moe_block_size + stages * (g_idx_stage + zp_sh_stage) + sh_s_size + + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + + // all remaining shared memory is used to cache A (input) + // sh_a_max_row is at least ` stages * 16 * thread_m_blocks ` + int sh_a_max_row = + ((max_shared_mem - 1024) / 16 - shm_size_used) / (thread_k_blocks * 2); + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2][b_thread_vecs]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + int frag_qzp[2][num_ints_per_thread]; // Zero-points + FragZP frag_zp; // Zero-points in fp16 + FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ + + // Zero accumulators. 
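+  // frag_c is thread_m_blocks x 4 x 2 fragments of 4 fp32 values each (the
+  // per-lane accumulators of the m16n8k16 mma above), so e.g. with
+  // thread_m_blocks = 4 the lambda below clears 4 * 4 * 2 * 4 = 128 floats.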
+ auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + + auto fetch_act_order_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups > act_s_max_num_groups) { + sh_num_groups = act_s_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + bool should_load_a = true; + int max_num_stage_groups = + ((sh_a_max_row - moe_block_size) / moe_block_size + 1) / stages; + max_num_stage_groups = max(max_num_stage_groups, 1); + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true, + int pipe_a = 0) { + if (pred) { + if (should_load_a) { + int4* sh_a_stage = sh_a + moe_block_size * a_sh_stride * pipe_a; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int row = a_gl_rd_delta_i / a_gl_stride * i + a_gl_rd_row; + int64_t sorted_row = 0; + if (!m_block_size_8 || row < 8) + sorted_row = sh_rd_block_sorted_ids[row]; + int64_t true_idx = + sorted_row * a_gl_stride + a_gl_rd_col + a_gl_rd_delta_o * a_off; + cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[true_idx], + row < block_num_valid_tokens); + } + } + + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < b_thread_vecs; j++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], + B_ptr[i] + j + B_expert_off); + } + + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { + cp_async4_pred(&sh_g_idx_stage[threadIdx.x], + &cur_g_idx_stage_ptr[threadIdx.x]); + } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], + &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } + } + + if constexpr (has_zp && group_blocks != -1) { + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch zero-points if 
this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } else { + for (int i = 0; i < zp_tb_groups; i++) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], + &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + auto fetch_col_zp_to_shared = [&]() { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + }; + + auto fetch_col_scale_to_shared = [&]() { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. + auto fetch_to_registers = [&](int k, int pipe, int pipe_a = 0) { + int4* sh_a_stage = sh_a + moe_block_size * a_sh_stride * pipe_a; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm( + frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + + #pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + if constexpr (!has_act_order) { + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } else if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + if (k % b_sh_wr_iters == 0) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + reinterpret_cast(&frag_s[1])[0] = + reinterpret_cast(&frag_s[0])[0]; + } + } else { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = + k_blocks / (group_blocks * (w_type == vllm::kFE2M1f ? 
2 : 1)); + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (w_type_id != vllm::kFE2M1f.id()) { + reinterpret_cast(&frag_s[k % 2])[0] = + sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } else { + reinterpret_cast(&frag_s[k % 2])[0] = + reinterpret_cast( + sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)]; + } + } + } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { + return; + } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + auto warp_id = threadIdx.x / 32; + int n_warps = + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + auto th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, + 9}; // Tensor core offsets per thread + + #pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + + auto fetch_zp_to_registers = [&](int k, int full_pipe) { + // This code does not handle group_blocks == 0, + // which signifies act_order. 
+ // has_zp implies AWQ, which doesn't have act_order, + static_assert(!has_zp || group_blocks != 0); + + if constexpr (has_zp && !is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + #pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i]; + } + } + + } else if constexpr (group_blocks >= thread_k_blocks) { + if (k % b_sh_wr_iters == 0) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + #pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } else { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = 0; + + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + sh_zp_stage += cur_group_id * zp_sh_stride; + + #pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } + + else if constexpr (has_zp && is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + if (k % b_sh_wr_iters == 0) { + int4* sh_zp_stage = + sh_zp + + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_zpf[k % 2])[0] = + sh_zp_stage[zp_sh_rd]; + } + } else { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + int cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + reinterpret_cast(&frag_zpf[k % 2])[0] = + sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride]; + } + } + } + }; + + auto dequant_data = [&](int q, scalar_t2* frag_b_ptr) { + dequant(q, frag_b_ptr); + }; + + // Execute the actual tensor core matmul of a sub-tile. 
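  // Per weight element, the fused dequant + MMA below boils down to
  // w = (q - zero_point) * scale, where the zero point is a fixed bias for
  // the symmetric u4b8 / u8b128 types and a per-group value for AWQ-style
  // u4 / u8. A scalar reference sketch (illustrative names, assuming eight
  // 4-bit weights packed per 32-bit word); the kernel instead dequantizes
  // whole half2 lanes at once and feeds them straight into mma() so that
  // dequantization overlaps the tensor-core work:
  auto dequant_sketch = [](unsigned int packed, int lane, float scale, int zp) {
    int q = (packed >> (4 * lane)) & 0xF;   // extract one 4-bit weight
    return (float(q) - float(zp)) * scale;  // (q - zero_point) * scale
  };
  (void)dequant_sketch;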
+ bool is_first_matmul_in_slice = true; + auto matmul = [&](int k) { + int k2 = k % 2; + const bool is_new_zp = + ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) || + (group_blocks == -1 && is_first_matmul_in_slice); + if constexpr (has_zp && !is_zp_float) { + if (is_new_zp) { + if constexpr (group_blocks == -1) is_first_matmul_in_slice = false; + int zp_quant_0, zp_quant_1; + + if constexpr (w_type.size_bits() == 4) { + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = zp_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = frag_qzp[k2][1]; + } + + dequant_data(zp_quant_0, reinterpret_cast(&frag_zp)); + dequant_data(zp_quant_1, reinterpret_cast(&frag_zp) + 2); + } + } + if constexpr (!dequant_skip_flop && has_zp && is_zp_float) { + if (is_new_zp) { + reinterpret_cast(&frag_zp)[0] = + reinterpret_cast(&frag_zpf[k2])[0]; + } + } + + if constexpr (w_type == vllm::kFE2M1f) { + int s_quant_0 = reinterpret_cast(frag_s[k2])[0]; + int s_quant_1 = reinterpret_cast(frag_s[k2])[1]; + + dequant_fp8_scales(s_quant_0, + reinterpret_cast(&frag_s[k2])); + dequant_fp8_scales( + s_quant_1, reinterpret_cast(&frag_s[k2]) + 2); + } + + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. + #pragma unroll + for (int j = 0; j < 4; j++) { + FragB frag_b0; + FragB frag_b1; + int b_quant_0, b_quant_1; + + if constexpr (w_type_id == vllm::kFE2M1f.id()) { + b_quant_1 = frag_b_quant[k2][0][j]; + b_quant_0 = b_quant_1 << 8; + } else if constexpr (w_type.size_bits() == 4) { + b_quant_0 = frag_b_quant[k2][0][j]; + b_quant_1 = b_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k2]); + b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + } + + dequant_data(b_quant_0, reinterpret_cast(&frag_b0)); + dequant_data(b_quant_1, reinterpret_cast(&frag_b1)); + + if constexpr (dequant_skip_flop && has_zp && !is_zp_float) { + sub_zp(frag_b0, frag_zp[j], 0); + sub_zp(frag_b1, frag_zp[j], 1); + } + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + static_assert(group_blocks != -1); + scale4(frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j], + act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0); + scale4(frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j], + act_frag_s[k2][2][j], act_frag_s[k2][3][j], 1); + } else if constexpr (!dequant_skip_flop && has_zp && !is_zp_float && + group_blocks == -1) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2 s2 = Dtype::nums2num2( + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 0])[idx], + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 1])[idx]); + if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2); + scale_and_sub(frag_b0, s2.x, frag_zp[j].x); + scale_and_sub(frag_b1, s2.y, frag_zp[j].y); + } else if constexpr (!dequant_skip_flop && has_zp && group_blocks != -1) { + if (is_new_zp) + frag_zp[j] = __hmul2(frag_zp[j], + *reinterpret_cast(&frag_s[k2][j])); + scale_and_sub(frag_b0, frag_s[k2][j][0].x, frag_zp[j].x); + scale_and_sub(frag_b1, frag_s[k2][j][0].y, frag_zp[j].y); + } else if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k2][j], 0); + scale(frag_b1, frag_s[k2][j], 1); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + if constexpr (m_block_size_8) { + mma_trans(frag_a[k2][i], frag_b0, frag_b1, frag_c[i][j][0]); + } else { + mma(frag_a[k2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k2][i], frag_b1, 
frag_c[i][j][1]); + } + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride_threads / 2; + if (red_off >= 1) { + auto red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j += (m_block_size_8 ? 2 : 1)) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = reinterpret_cast( + &sh_red[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh_red[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i += (m_block_size_8 ? 2 : 1)) { + float* c_rd = + reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + if (!is_th_active) { + return; + } + + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr; + if constexpr (m_block_size_8) { + c_gl_wr = c_gl_stride * ((threadIdx.x % 4) * 2) + 4 * (threadIdx.x / 32) + + (threadIdx.x % 32) / 8; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } else { + c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + if (!first) { + + #pragma unroll + for (int i = 0; i < (m_block_size_8 ? 
2 : thread_m_blocks * 4); i++) { + int c_idx; + if constexpr (m_block_size_8) + c_idx = c_gl_wr + i * c_gl_stride + + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i; + else + c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + if (c_idx / c_gl_stride < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride]; + int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + sh_red[c_sh_wr + c_sh_wr_delta * i] = C[true_idx]; + } + } + } + + #pragma unroll + for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) { + if (!first) { + int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { + delta = j % 2 == 1 ? -2 : 0; + } + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta] += + Dtype::num2float(reinterpret_cast(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { + delta = j % 2 == 1 ? -2 : 0; + } + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta]); + } + + int c_idx; + if constexpr (m_block_size_8) + c_idx = c_gl_wr + i * c_gl_stride + + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i; + else + c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + if (c_idx / c_gl_stride < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride]; + int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + C[true_idx] = c; + } + } + } + }; + + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. + auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = locks_off * c_size; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + if constexpr (m_block_size_8) { + if (k % 2) continue; + } else { + if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) + continue; + } + + sh_red[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + if constexpr (m_block_size_8) { + if (k % 2) continue; + } else { + if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) + continue; + } + + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
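  // write_result below converts pairs of fp32 accumulators to the 16-bit
  // output type and stages them through shared memory so the final global
  // stores are coalesced. For the half case the packing step is essentially
  // the sketch below (assuming scalar_t == half; Dtype::float2num /
  // nums2num2 abstract the same operation for bf16), with the per-column
  // scale applied at this point only on the 4-bit channelwise path:
  auto pack_half2_sketch = [](float c0, float c1, __half2 col_scale) {
    __half2 r = __floats2half2_rn(c0, c1);  // two fp32 results -> half2
    return __hmul2(r, col_scale);           // optional per-column scale
  };
  (void)pack_half2_sketch;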
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr; + if constexpr (m_block_size_8) { + c_sh_wr = (8 * c_sh_stride) * ((threadIdx.x % 32) % 4 * 2) + + (threadIdx.x % 32) / 4; + c_sh_wr += 64 * (threadIdx.x / 32); + } else { + c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + } + + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = + Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + + // For per-column quantization we finally apply the scale here (only for + // 4-bit) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 4 && + (has_zp && dequant_skip_flop || !has_zp)) { + res = __hmul2(res, s[0]); + } + + if constexpr (w_type == vllm::kFE2M1f) { + if (!mul_topk_weights) { + res = __hmul2(res, global_scale); + } + } + + if constexpr (m_block_size_8) { + ((scalar_t*)sh_red)[idx] = res.x; + ((scalar_t*)sh_red)[idx + 8 * c_sh_stride] = res.y; + } else { + ((scalar_t2*)sh_red)[idx] = res; + } + }; + + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + if constexpr (m_block_size_8) { + int wr = c_sh_wr + 16 * j; + write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], + frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], + frag_s[j / 2][2 * (j % 2) + 1]); + } else { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + int row = c_gl_wr / c_gl_stride; + if (row < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[row]; + int64_t true_idx = sorted_row * c_gl_stride + c_gl_wr % c_gl_stride; + scalar_t2 topk_weight_score; + if (mul_topk_weights) topk_weight_score = sh_block_topk_weights[row]; + if (use_atomic_add && slice_count > 1 || mul_topk_weights) { + scalar_t2* C_half2 = reinterpret_cast(&C[true_idx]); + scalar_t2* sh_red_half2 = + reinterpret_cast(&sh_red[c_sh_rd]); + #pragma unroll + for (int a = 0; a < 4; a++) { + scalar_t2 res = sh_red_half2[a]; + if (mul_topk_weights) { + res = __hmul2(res, topk_weight_score); + } + + if (use_atomic_add && slice_count > 1) { + atomicAdd(&C_half2[a], res); + } else { + C_half2[a] = res; + }; + } + } else { + C[true_idx] = sh_red[c_sh_rd]; + } + 
c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + __syncthreads(); + }; + + // Start global fetch and register load pipelines. + auto start_pipes = [&]() { + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + fetch_act_order_scales_to_shared(true, g_idx[slice_k_start], + g_idx[last_g_idx]); + } + + if constexpr (has_zp && !is_zp_float && group_blocks == -1) { + if (i == 0) { + fetch_col_zp_to_shared(); + if constexpr (!dequant_skip_flop) { + fetch_col_scale_to_shared(); + } + } + } + fetch_to_shared(i, i, i < slice_iters, i); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + fetch_zp_to_registers(0, 0); + a_gl_rd_col += a_gl_rd_delta_o * (stages - 1); + if constexpr (has_act_order) { + slice_k_start_shared_fetch += tb_k * (stages - 1); + } + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. + + for (int stage_group_id = 0; stage_group_id < max_num_stage_groups; + stage_group_id++) { + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + int idx = + (pipe >= stages && stage_group_id == max_num_stage_groups - 1) + ? (pipe - stages) + : (pipe + stage_group_id * stages); + fetch_to_registers(k + 1, pipe % stages, idx); + fetch_scales_to_registers(k + 1, pipe); + fetch_zp_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + int idx = (pipe >= 1 && stage_group_id == max_num_stage_groups - 1) + ? (pipe - 1) + : (pipe + (stage_group_id + 1) * stages - 1); + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages, idx); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + + a_gl_rd_col += a_gl_rd_delta_o * stages; + + if constexpr (has_act_order) { + slice_k_start += tb_k * stages; + + if (slice_k_start < prob_k) { + slice_k_start_shared_fetch += tb_k * stages; + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_act_order_scales_to_shared(false, first_group_id, + last_group_id); + __syncthreads(); + } + } + } + if (slice_iters == 0) { + break; + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. 
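    // Stripped of the unrolling and index bookkeeping, the main loop above
    // follows the classic multi-stage prefetch shape sketched here
    // (pseudocode with illustrative names, not the kernel's exact control
    // flow):
    //   for (int i = 0; i < stages - 1; i++) fetch_to_shared(i);  // prologue
    //   while (tiles_left) {
    //     wait_for_stage();        // oldest in-flight cp.async group is done
    //     compute(oldest_stage);   // registers double-buffered per k-step
    //     fetch_to_shared(next);   // refill the pipeline
    //   }
    // Once slice_iters reaches zero, the epilogue below reduces partial
    // results and writes them out.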
+ if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1 && + (has_zp && dequant_skip_flop || !has_zp)) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1 && + (has_zp && dequant_skip_flop || !has_zp)) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + if constexpr (m_block_size_8) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2* frag_s_half2 = reinterpret_cast(frag_s); + #pragma unroll + for (int i = 0; i < 8; i++) { + frag_s_half2[i] = Dtype::num2num2( + reinterpret_cast(&frag_s_half2[i])[idx]); + } + } + } + } + } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 8 && + (has_zp && dequant_skip_flop || !has_zp)) { + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + scale_float( + reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float( + reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + (m_block_size_8 ? 1 : 0)]); + + if constexpr (!m_block_size_8) { + scale_float( + reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float( + reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + } + } + + if (slice_count > 1 && !use_atomic_add) { + // only globally reduce if there is more than one block in a slice + barrier_acquire(&locks[locks_off], slice_idx); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } + barrier_release(&locks[locks_off], last); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) + wait_negative_and_add(&locks[locks_off]); + if (last || use_atomic_add) + // only the last block in a slice actually writes the result + write_result(); + int old_slice_row = slice_row; + slice_row = 0; + slice_col_par++; + slice_col++; + is_first_matmul_in_slice = true; + init_slice(); + + // Should we load A matrix in next slice? 
+ // `slice_col == 0`: when move to a new moe block + // `old_slice_row > 0`: + // when the last slice is not starting from k_index == 0 + // (only happen when it is the first slice of a threadblock) + // `prob_k > thread_k_blocks * 16 * stages * max_num_stage_groups`: + // when the required shared memory size is larger than + // the remaining shared memory + if (slice_col == 0 || old_slice_row || + prob_k > thread_k_blocks * 16 * stages * max_num_stage_groups) { + should_load_a = true; + } else { + should_load_a = false; + } + + if (slice_iters) { + a_gl_rd_col = (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } + start_pipes(); + } + } + } +} + +} // namespace MARLIN_NAMESPACE_NAME + +#endif diff --git a/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/ops.cu b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/ops.cu new file mode 100644 index 0000000..2cff04f --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/marlin_moe_wna16/ops.cu @@ -0,0 +1,940 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "kernel.h" +#include "core/registration.h" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace MARLIN_NAMESPACE_NAME { + +__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){}; + +using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + const int32_t* __restrict__ sorted_token_ids_ptr, + const int32_t* __restrict__ expert_ids_ptr, + const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, + int size_k, int top_k) {}; + +} // namespace marlin + +torch::Tensor moe_wna16_marlin_gemm( + torch::Tensor& a, std::optional const& c_or_none, + torch::Tensor& b_q_weight, torch::Tensor& b_scales, + std::optional const& b_zeros_or_none, + std::optional const& g_idx_or_none, + std::optional const& perm_or_none, torch::Tensor& workspace, + torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, + torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights, + int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + TORCH_CHECK_NOT_IMPLEMENTED(false, + "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); + return torch::empty({1, 1}); +} + +#else + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. 
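// Reference semantics of the kernel below as a plain loop, kept as a hedged
// sketch (the template and names here are illustrative, not part of the
// kernel's interface). Each output row is one sorted token id of a MoE
// block, and `perm` already points at the permutation of that block's
// expert (the kernel advances it by size_k on every expert change):
template <typename T>
static void permute_cols_reference(const T* a, const int* perm, T* out,
                                   const int* rows, int num_rows,
                                   int64_t size_k, int top_k) {
  for (int i = 0; i < num_rows; i++) {
    int64_t row = rows[i];  // sorted token id
    for (int64_t k = 0; k < size_k; k++)
      out[row * size_k + k] = a[(row / top_k) * size_k + perm[k]];
  }
}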
+template +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + const int32_t* __restrict__ sorted_token_ids_ptr, + const int32_t* __restrict__ expert_ids_ptr, + const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, + int size_k, int top_k) { + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; + int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size); + int32_t block_sorted_ids[moe_block_size]; + int block_num_valid_tokens = 0; + int64_t old_expert_id = 0; + int64_t expert_id = 0; + int row_stride = size_k * sizeof(half) / 16; + + auto read_moe_block_data = [&](int block_id) { + block_num_valid_tokens = moe_block_size; + int4* tmp_block_sorted_ids = reinterpret_cast(block_sorted_ids); + for (int i = 0; i < moe_block_size / 4; i++) { + tmp_block_sorted_ids[i] = + ((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i]; + } + for (int i = 0; i < moe_block_size; i++) { + if (block_sorted_ids[i] >= size_m * top_k) { + block_num_valid_tokens = i; + break; + }; + } + }; + + auto permute_row = [&](int row) { + int iters = size_k / default_threads; + int rest = size_k % default_threads; + + int in_offset = (row / top_k) * row_stride; + int out_offset = row * row_stride; + + half const* a_row_half = + reinterpret_cast(a_int4_ptr + in_offset); + half* out_half = reinterpret_cast(out_int4_ptr + out_offset); + + int base_k = 0; + + for (int i = 0; i < iters; i++) { + auto cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += default_threads; + } + + if (rest) { + if (threadIdx.x < rest) { + auto cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) { + old_expert_id = expert_id; + int tmp_expert_id = expert_ids_ptr[index]; + if (tmp_expert_id == -1) continue; + expert_id = tmp_expert_id; + perm_int_ptr += (expert_id - old_expert_id) * size_k; + read_moe_block_data(index); + + for (int i = 0; i < block_num_valid_tokens; i++) + permute_row(block_sorted_ids[i]); + } +} + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, + {64, 128, 128}}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, + {64, 128, 128}}; + +typedef struct { + int blocks_per_sm; + thread_config_t tb_cfg; +} exec_config_t; + +int get_scales_cache_size(thread_config_t const& th_config, int prob_m, + int prob_n, int prob_k, int num_bits, int group_size, + bool has_act_order, bool is_k_full) { + bool cache_scales_chunk = has_act_order && !is_k_full; + + int tb_n = th_config.thread_n; + int tb_k = th_config.thread_k; + + // Get max scale groups per thread-block + int tb_groups; + if (group_size == -1) { + tb_groups = 1; + } else if (group_size == 0) { + tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size + } else { + tb_groups = div_ceil(tb_k, group_size); + } + + if (cache_scales_chunk) { + int load_groups = + tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + load_groups = max(load_groups, 32); // We load at least 32 scale groups + return load_groups * tb_n * 2; + } else { + int tb_scales = tb_groups * 
tb_n * 2; + + return tb_scales * pipe_stages; + } +} + +int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, + int thread_m_blocks, int prob_m, int prob_n, + int prob_k, int num_bits, int group_size, + bool has_act_order, bool is_k_full, int has_zp, + int is_zp_float) { + int pack_factor = 32 / num_bits; + + // Get B size + int tb_k = th_config.thread_k; + int tb_n = th_config.thread_n; + int tb_m = thread_m_blocks * (m_block_size_8 ? 8 : 16); + + // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights + // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) + int sh_block_meta_size = tb_m * 4; + int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; + int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; + int sh_red_size = tb_m * (tb_n + 8) * 2; + int sh_s_size = + get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, + group_size, has_act_order, is_k_full); + int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0; + int sh_zp_size = 0; + if (has_zp) { + if (is_zp_float) + sh_zp_size = sh_s_size; + else if (num_bits == 4) + sh_zp_size = sh_s_size / 4; + else if (num_bits == 8) + sh_zp_size = sh_s_size / 2; + } + + int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size + + sh_zp_size + sh_g_idx_size + sh_block_meta_size; + + return total_size; +} + +bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, + int thread_m_blocks, int prob_m, int prob_n, int prob_k, + int num_bits, int group_size, bool has_act_order, + bool is_k_full, int has_zp, int is_zp_float, + int max_shared_mem) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + // Check that pipeline fits into cache + int cache_size = get_kernel_cache_size( + th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); + return cache_size <= max_shared_mem; +} + + #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ + else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + m_block_size_8 == M_BLOCK_SIZE_8 && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + kernel = Marlin; \ + } + + // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false) + // this is the most common cases + // BIGGROUP: cases for big group size (group_blocks in [-1, 8]) + // FZP: cases for float-zero-point (is_zp_float = true) + // ACT: cases for act order case (group_blocks == 0) + // FP4: cases for nvfp4(e2m1) (group_blocks == 1) + #define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, 
K_BLOCKS, true, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + + #define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + + #define COMMON_GET_IF(W_TYPE) \ + COMMON_GET_IF_M1(W_TYPE, 8, 8, 256) \ + COMMON_GET_IF_M1(W_TYPE, 8, 4, 128) \ + COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \ + COMMON_GET_IF_M234(W_TYPE, 8, 4, 128) + + #define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + + #define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + + #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) + + #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) + + #define FP4_GET_IF(W_TYPE) \ + FP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + FP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + FP4_GET_IF_M234(W_TYPE, 8, 4, 128) + + #define BIGGROUP_GET_IF(W_TYPE) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \ + BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \ + BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) + + // We currently have 4-bit models only with group_blocks == 4 + #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) + + #define 
FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) + + #define FZP_GET_IF(W_TYPE) \ + FZP_GET_IF_M1(W_TYPE, 8, 8, 256) \ + FZP_GET_IF_M1(W_TYPE, 8, 4, 128) \ + FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \ + FZP_GET_IF_M234(W_TYPE, 8, 4, 128) + + // We currently have 4-bit models only with group_blocks == 4 + #define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) + + #define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) + + #define ACT_GET_IF(W_TYPE) \ + ACT_GET_IF_M1(W_TYPE, 8, 8, 256) \ + ACT_GET_IF_M1(W_TYPE, 8, 4, 128) \ + ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \ + ACT_GET_IF_M234(W_TYPE, 8, 4, 128) + +template +MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, + int thread_m_blocks, int thread_n_blocks, + int thread_k_blocks, bool m_block_size_8, + bool has_act_order, bool has_zp, + int group_blocks, int num_threads, + bool is_zp_float) { + int num_bits = q_type.size_bits(); + auto kernel = MarlinDefault; + if (false) { + } + + COMMON_GET_IF(vllm::kU4) + COMMON_GET_IF(vllm::kU4B8) + COMMON_GET_IF(vllm::kU8B128) + + BIGGROUP_GET_IF(vllm::kFE4M3fn) + + FP4_GET_IF(vllm::kFE2M1f) + + ACT_GET_IF(vllm::kU4B8) + ACT_GET_IF(vllm::kU8B128) + + return kernel; +} + +template +exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, + int prob_n, int prob_k, int thread_m_blocks, + bool m_block_size_8, int num_bits, + int group_size, bool has_act_order, + bool is_k_full, bool has_zp, + bool is_zp_float, int max_shared_mem) { + exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; + thread_config_t* thread_configs = thread_m_blocks > 1 + ? large_batch_thread_configs + : small_batch_thread_configs; + int thread_configs_size = + thread_m_blocks > 1 + ? sizeof(large_batch_thread_configs) / sizeof(thread_config_t) + : sizeof(small_batch_thread_configs) / sizeof(thread_config_t); + + int count = 0; + constexpr int device_max_reg_size = 255 * 1024; + for (int i = 0; i < thread_configs_size; i++) { + thread_config_t th_config = thread_configs[i]; + + if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m, + prob_n, prob_k, num_bits, group_size, has_act_order, + is_k_full, has_zp, is_zp_float, max_shared_mem)) { + continue; + } + + int cache_size = get_kernel_cache_size( + th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); + + int group_blocks = 0; + if (!has_act_order) { + group_blocks = group_size == -1 ? 
-1 : (group_size / 16); + } + + auto kernel = get_marlin_kernel( + q_type, thread_m_blocks, th_config.thread_n / 16, + th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, + group_blocks, th_config.num_threads, is_zp_float); + + if (kernel == MarlinDefault) continue; + + if (thread_m_blocks > 1) { + exec_cfg = {1, th_config}; + break; + } else { + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, kernel); + int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4; + int allow_count = min(device_max_reg_size / reg_size, + max_shared_mem / (cache_size + 1024)); + allow_count = max(min(allow_count, 4), 1); + if (allow_count > count) { + count = allow_count; + exec_cfg = {count, th_config}; + }; + } + } + + return exec_cfg; +} + +template +void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, + void* s2, void* zp, void* g_idx, void* perm, void* a_tmp, + void* sorted_token_ids, void* expert_ids, + void* num_tokens_past_padded, void* topk_weights, + int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, + int prob_m, int prob_n, int prob_k, void* workspace, + vllm::ScalarType const& q_type, bool has_act_order, + bool is_k_full, bool has_zp, int num_groups, int group_size, + int dev, cudaStream_t stream, int thread_k, int thread_n, + int sms, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + int thread_m_blocks = div_ceil(moe_block_size, 16); + bool m_block_size_8 = moe_block_size == 8; + + if (has_zp) { + TORCH_CHECK( + q_type == vllm::kU4 || q_type == vllm::kU8, + "q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str()); + } else { + TORCH_CHECK( + q_type == vllm::kU4B8 || q_type == vllm::kU8B128 || + q_type == vllm::kFE4M3fn || q_type == vllm::kFE2M1f, + "q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when " + "has_zp = False. 
Got = ", + q_type.str()); + } + + TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, + ", ", prob_n, ", ", prob_k, "]"); + + int group_blocks = 0; + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(group_size != -1); + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } else { + TORCH_CHECK(group_size == 0); + group_blocks = 0; + } + } else { + if (group_size == -1) { + group_blocks = -1; + } else { + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + } + + int num_bits = q_type.size_bits(); + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; + const int4* s_ptr = (const int4*)s; + const uint16_t* s2_ptr = (const uint16_t*)s2; + const int4* zp_ptr = (const int4*)zp; + const int* g_idx_ptr = (const int*)g_idx; + const int* perm_ptr = (const int*)perm; + int4* a_tmp_ptr = (int4*)a_tmp; + const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids; + const int32_t* expert_ids_ptr = (const int32_t*)expert_ids; + const int32_t* num_tokens_past_padded_ptr = + (const int32_t*)num_tokens_past_padded; + const float* topk_weights_ptr = (const float*)topk_weights; + int* locks = (int*)workspace; + + if (has_act_order) { + // Permute A columns + auto kernel = permute_cols_kernel<8>; + if (moe_block_size == 8) { + } else if (moe_block_size == 16) + kernel = permute_cols_kernel<16>; + else if (moe_block_size == 32) + kernel = permute_cols_kernel<32>; + else if (moe_block_size == 48) + kernel = permute_cols_kernel<48>; + else if (moe_block_size == 64) + kernel = permute_cols_kernel<64>; + else + TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size); + + // avoid ">>>" being formatted to "> > >" + // clang-format off + kernel<<>>( + A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr, + num_tokens_past_padded_ptr, prob_m, prob_k, top_k); + // clang-format on + A_ptr = a_tmp_ptr; + prob_m = prob_m * top_k; + top_k = 1; + + // If we have a full K, then we can run the non-act-order version of Marlin + // (since the weight rows are reordered by increasing group ids, and by + // having a full K, we have full original groups) + if (is_k_full) has_act_order = false; + } + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + // Set thread config + exec_config_t exec_cfg; + thread_config_t thread_tfg; + if (thread_k != -1 && thread_n != -1) { + thread_tfg = thread_config_t{thread_k, thread_n, default_threads}; + exec_cfg = exec_config_t{1, thread_tfg}; + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + } else { + // Auto config + exec_cfg = determine_exec_config( + q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8, + num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, + max_shared_mem); + thread_tfg = exec_cfg.tb_cfg; + } + + int num_threads = thread_tfg.num_threads; + thread_k = thread_tfg.thread_k; + thread_n = thread_tfg.thread_n; + int blocks = sms * exec_cfg.blocks_per_sm; + if (exec_cfg.blocks_per_sm > 1) + max_shared_mem = max_shared_mem / 
exec_cfg.blocks_per_sm - 1024; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + TORCH_CHECK( + is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks, prob_m, + prob_n, prob_k, num_bits, group_size, has_act_order, + is_k_full, has_zp, is_zp_float, max_shared_mem), + "Invalid thread config: thread_m_blocks = ", thread_m_blocks, + ", thread_k = ", thread_tfg.thread_k, + ", thread_n = ", thread_tfg.thread_n, + ", num_threads = ", thread_tfg.num_threads, " for MKN = [", prob_m, ", ", + prob_k, ", ", prob_n, "] and num_bits = ", num_bits, + ", group_size = ", group_size, ", has_act_order = ", has_act_order, + ", is_k_full = ", is_k_full, ", has_zp = ", has_zp, + ", is_zp_float = ", is_zp_float, ", max_shared_mem = ", max_shared_mem); + + auto kernel = get_marlin_kernel( + q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, + has_act_order, has_zp, group_blocks, num_threads, is_zp_float); + + if (kernel == MarlinDefault) { + TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, + ", ", prob_k, "]", ", has_act_order = ", has_act_order, + ", num_groups = ", num_groups, ", group_size = ", group_size, + ", thread_m_blocks = ", thread_m_blocks, + ", thread_n_blocks = ", thread_n_blocks, + ", thread_k_blocks = ", thread_k_blocks, + ", num_bits = ", num_bits); + } + + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem); + // avoid ">>>" being formatted to "> > >" + // clang-format off + kernel<<>>( + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, + sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, + topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, + prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce, max_shared_mem); + // clang-format on +} + +} // namespace MARLIN_NAMESPACE_NAME + +torch::Tensor moe_wna16_marlin_gemm( + torch::Tensor& a, std::optional const& c_or_none, + torch::Tensor& b_q_weight, torch::Tensor& b_scales, + std::optional const& global_scale_or_none, + std::optional const& b_zeros_or_none, + std::optional const& g_idx_or_none, + std::optional const& perm_or_none, torch::Tensor& workspace, + torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, + torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights, + int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); + int pack_factor = 32 / b_q_type.size_bits(); + + if (moe_block_size != 8) { + TORCH_CHECK(moe_block_size % 16 == 0, + "unsupported moe_block_size=", moe_block_size); + TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64, + "unsupported moe_block_size=", moe_block_size); + } + + // Verify A + TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), + ", size_m = ", size_m); + TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), + ", size_k = ", size_k); + + // Verify B + TORCH_CHECK( + size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k, + " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1), + "Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1), + ", size_k = ", size_k, + ", tile_size = ", 
MARLIN_NAMESPACE_NAME::tile_size); + TORCH_CHECK( + b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0, + "b_q_weight.size(2) = ", b_q_weight.size(2), + " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + int actual_size_n = + (b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor; + TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, + ", actual_size_n = ", actual_size_n); + + // Verify device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel + int sms = -1; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); + + // Alloc buffers + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c; + if (c_or_none.has_value()) { + c = c_or_none.value(); + TORCH_CHECK(c.device().is_cuda(), "c is not on GPU"); + TORCH_CHECK(c.is_contiguous(), "c is not contiguous"); + TORCH_CHECK(c.size(0) == size_m * top_k, + "Shape mismatch: c.size(0) = ", c.size(0), + ", size_m * topk = ", size_m * top_k); + TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1), + ", size_n = ", size_n); + } else { + c = torch::empty({size_m * top_k, size_n}, options); + } + + // Alloc C tmp buffer that is going to be used for the global reduce + torch::Tensor c_tmp; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (use_fp32_reduce && !use_atomic_add) { + // max num of threadblocks is sms * 4 + long max_c_tmp_size = min( + (long)size_n * sorted_token_ids.size(0), + (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n); + if (moe_block_size == 8) max_c_tmp_size *= 2; + c_tmp = torch::empty({max_c_tmp_size}, options_fp32); + } else { + c_tmp = torch::empty({0}, options_fp32); + } + + // Detect groupsize and act_order + int num_groups = -1; + int group_size = -1; + + int rank = b_scales.sizes().size(); + TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3"); + TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2), + " is not size_n = ", size_n); + num_groups = b_scales.size(1); + + torch::Tensor g_idx, perm, a_tmp; + ; + if (g_idx_or_none.has_value() && perm_or_none.has_value()) { + g_idx = g_idx_or_none.value(); + perm = perm_or_none.value(); + + TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU"); + TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous"); + TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU"); + TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous"); + + // Verify g_idx and perm + TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) || + (g_idx.size(-1) == size_k && perm.size(-1) == size_k), + "Unexpected g_idx.size(-1) = ", g_idx.size(-1), + " and perm.size(-1) = ", perm.size(-1), + ", where size_k = ", size_k); + } else { + g_idx = torch::empty({0}, options); + perm = 
torch::empty({0}, options); + a_tmp = torch::empty({0}, options); + } + bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0; + + if (has_act_order) { + a_tmp = torch::empty({size_m * top_k, size_k}, options); + if (is_k_full) { + TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1"); + TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by num_groups = ", num_groups); + group_size = size_k / num_groups; + } else { + group_size = 0; + } + + } else { + a_tmp = torch::empty({0}, options); + if (num_groups > 1) { + TORCH_CHECK( + size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by b_scales.size(1) = ", b_scales.size(1)); + group_size = size_k / num_groups; + } else { + group_size = -1; + } + } + + torch::Tensor global_scale; + if (global_scale_or_none.has_value()) { + global_scale = global_scale_or_none.value(); + TORCH_CHECK(b_q_type == vllm::kFE2M1f, + "global_scale can only be used for float4_e2m1f."); + } else { + global_scale = torch::empty({0}, options); + TORCH_CHECK(!(b_q_type == vllm::kFE2M1f), + "the global_scale parameter must be passed for float4_e2m1f."); + } + + torch::Tensor b_zeros; + if (b_zeros_or_none.has_value()) { + b_zeros = b_zeros_or_none.value(); + TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU"); + TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous"); + } else { + b_zeros = torch::empty({0}, options); + } + bool has_zp = b_zeros.size(-1) > 0; + + if (has_zp) { + TORCH_CHECK( + b_q_type == vllm::kU4 || b_q_type == vllm::kU8, + "b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); + } else { + TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128 || + b_q_type == vllm::kFE4M3fn || b_q_type == vllm::kFE2M1f, + "b_q_type must be uint4b8, uint8b128, float8_e4m3fn or " + "float4_e2m1f when " + "has_zp = False. 
Got = ", + b_q_type.str()); + } + + if (has_zp && is_zp_float) { + TORCH_CHECK(a.scalar_type() == at::ScalarType::Half, + "Computation type must be float16 (half) when using float zero " + "points."); + } + + // Verify b_zeros + if (has_zp) { + int rank = b_zeros.sizes().size(); + TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3"); + if (is_zp_float) { + TORCH_CHECK(b_zeros.size(2) == size_n, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n = ", size_n); + TORCH_CHECK(num_groups == b_zeros.size(1), + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(num_groups != -1, "num_groups must be != -1"); + } else { + TORCH_CHECK(b_zeros.size(1) == num_groups, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n / pack_factor = ", size_n / pack_factor); + } + } + + // Verify workspace size + TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0, + "size_n = ", size_n, ", is not divisible by min_thread_n = ", + MARLIN_NAMESPACE_NAME::min_thread_n); + + int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n; + int min_workspace_size = min( + max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4); + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = ", workspace.numel(), + " is below min_workspace_size = ", min_workspace_size); + + int dev = a.get_device(); + if (a.scalar_type() == at::ScalarType::Half) { + void* scales_ptr; + if (b_q_type == vllm::kFE2M1f) { + scales_ptr = b_scales.data_ptr(); + } else { + scales_ptr = b_scales.data_ptr(); + } + + MARLIN_NAMESPACE_NAME::marlin_mm( + a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + c_tmp.data_ptr(), scales_ptr, global_scale.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), + topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, + size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, + is_k_full, has_zp, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + use_atomic_add, use_fp32_reduce, is_zp_float); + } else if (a.scalar_type() == at::ScalarType::BFloat16) { + void* scales_ptr; + if (b_q_type == vllm::kFE2M1f) { + scales_ptr = b_scales.data_ptr(); + } else { + scales_ptr = b_scales.data_ptr(); + } + + MARLIN_NAMESPACE_NAME::marlin_mm( + a.data_ptr(), b_q_weight.data_ptr(), + c.data_ptr(), c_tmp.data_ptr(), scales_ptr, + global_scale.data_ptr(), b_zeros.data_ptr(), + g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), + sorted_token_ids.data_ptr(), expert_ids.data_ptr(), + num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), + moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, + workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, + num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), + thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); + } else { + TORCH_CHECK(false, + "moe_wna16_marlin_gemm only supports bfloat16 and float16"); + } + + return c; +} + +#endif + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); +} diff --git a/vllm_v0.10.0/csrc/moe/moe_align_sum_kernels.cu b/vllm_v0.10.0/csrc/moe/moe_align_sum_kernels.cu new file mode 100644 index 
0000000..8bbcf5a --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/moe_align_sum_kernels.cu @@ -0,0 +1,318 @@ +#include +#include +#include +#include + +#include +#include + +#include "../cuda_compat.h" +#include "../dispatch_utils.h" + +#define CEILDIV(x, y) (((x) + (y) - 1) / (y)) + +namespace vllm { +namespace moe { + +template +__global__ void moe_align_block_size_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, + int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, + size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) { + extern __shared__ int32_t shared_counts[]; + + // Initialize sorted_token_ids with numel + for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { + sorted_token_ids[it] = numel; + } + + const int warp_id = threadIdx.x / WARP_SIZE; + const int my_expert_start = warp_id * experts_per_warp; + + for (int i = 0; i < experts_per_warp; ++i) { + if (my_expert_start + i < padded_num_experts) { + shared_counts[warp_id * experts_per_warp + i] = 0; + } + } + + __syncthreads(); + + const size_t tid = threadIdx.x; + const size_t stride = blockDim.x; + + for (size_t i = tid; i < numel; i += stride) { + int expert_id = topk_ids[i]; + int warp_idx = expert_id / experts_per_warp; + int expert_offset = expert_id % experts_per_warp; + atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); + } + + __syncthreads(); + + // Compute prefix sum over token counts per expert + using BlockScan = cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + int expert_count = 0; + int expert_id = threadIdx.x; + if (expert_id < num_experts) { + int warp_idx = expert_id / experts_per_warp; + int expert_offset = expert_id % experts_per_warp; + expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset]; + expert_count = CEILDIV(expert_count, block_size) * block_size; + } + + int cumsum_val; + BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val); + if (expert_id <= num_experts) { + cumsum[expert_id] = cumsum_val; + } + + if (expert_id == num_experts) { + *total_tokens_post_pad = cumsum_val; + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + } + + // Fill remaining expert_ids with 0 + const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; + const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); + for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { + expert_ids[i] = 0; + } +} + +template +__global__ void count_and_sort_expert_tokens_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + size_t numel) { + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t stride = blockDim.x * gridDim.x; + + for (size_t i = tid; i < numel; i += stride) { + int32_t expert_id = topk_ids[i]; + int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); + sorted_token_ids[rank_post_pad] = i; + } +} + +template +__global__ void moe_sum_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., topk, d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + 
scalar_t x = 0.0; +#pragma unroll + for (int k = 0; k < TOPK; ++k) { + x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); + } + out[token_idx * d + idx] = x; + } +} + +template +__global__ void moe_align_block_size_small_batch_expert_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, + int32_t block_size, size_t numel, int32_t max_num_tokens_padded) { + // Initialize sorted_token_ids with numel + for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { + sorted_token_ids[it] = numel; + } + + const size_t tid = threadIdx.x; + const size_t stride = blockDim.x; + + extern __shared__ int32_t shared_mem[]; + int32_t* cumsum = shared_mem; + int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1); + + for (int i = 0; i < num_experts; ++i) { + tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0; + } + + for (size_t i = tid; i < numel; i += stride) { + ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]]; + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + tokens_cnts[threadIdx.x] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[i * num_experts + threadIdx.x] += + tokens_cnts[(i - 1) * num_experts + threadIdx.x]; + } + } + + __syncthreads(); + + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + cumsum[i] = + cumsum[i - 1] + + CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) * + block_size; + } + *total_tokens_post_pad = static_cast(cumsum[num_experts]); + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + } + + // Fill remaining expert_ids with 0 + const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; + const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); + for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { + expert_ids[i] = 0; + } + + for (size_t i = tid; i < numel; i += stride) { + int32_t expert_id = topk_ids[i]; + int32_t rank_post_pad = + tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id]; + sorted_token_ids[rank_post_pad] = i; + ++tokens_cnts[threadIdx.x * num_experts + expert_id]; + } +} + +} // namespace moe +} // namespace vllm + +// taken from +// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc +void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad) { + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int64_t padded_num_experts = + ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + int experts_per_warp = WARP_SIZE; + int threads = 1024; + threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + + // BlockScan uses 1024 threads and assigns one thread per expert. 
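+  // For illustration only: the padded layout produced by the kernels above can
+  // be reproduced on the host with a small sketch (the lambda below is an
+  // exposition aid, not used by this file; it assumes std::vector is available
+  // through the Torch headers). Each expert's token count is rounded up to a
+  // multiple of block_size, and an exclusive prefix sum of the padded counts
+  // gives that expert's start offset in sorted_token_ids.
+  auto padded_offsets_ref = [&](const std::vector<int32_t>& tokens_per_expert) {
+    std::vector<int32_t> offsets(tokens_per_expert.size() + 1, 0);
+    for (size_t e = 0; e < tokens_per_expert.size(); ++e) {
+      int32_t padded = static_cast<int32_t>(
+          (tokens_per_expert[e] + block_size - 1) / block_size * block_size);
+      offsets[e + 1] = offsets[e] + padded;
+    }
+    return offsets;
+  };
+  (void)padded_offsets_ref;
+  // Example: tokens_per_expert = {5, 0, 17} with block_size = 16 gives padded
+  // counts {16, 0, 32} and offsets {0, 16, 16, 48}; the last offset is the
+  // value written to num_tokens_post_pad.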
+ TORCH_CHECK(padded_num_experts < 1024, + "padded_num_experts must be less than 1024"); + + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + // calc needed amount of shared mem for `cumsum` tensors + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); + bool small_batch_expert_mode = + (topk_ids.numel() < 1024) && (num_experts <= 64); + + if (small_batch_expert_mode) { + const int32_t threads = max((int32_t)num_experts, WARP_SIZE); + const int32_t shared_mem_size = + ((threads + 1) * num_experts + (num_experts + 1)) * + sizeof(int32_t); + + auto small_batch_expert_kernel = + vllm::moe::moe_align_block_size_small_batch_expert_kernel< + scalar_t>; + small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel(), sorted_token_ids.size(0)); + } else { + auto align_kernel = vllm::moe::moe_align_block_size_kernel; + + size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp); + size_t shared_mem_size = + num_warps * experts_per_warp * sizeof(int32_t); + + align_kernel<<<1, threads, shared_mem_size, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, + padded_num_experts, experts_per_warp, block_size, + topk_ids.numel(), cumsum_buffer.data_ptr(), + sorted_token_ids.size(0)); + + const int block_threads = std::min(256, (int)threads); + const int num_blocks = + (topk_ids.numel() + block_threads - 1) / block_threads; + const int max_blocks = 65535; + const int actual_blocks = std::min(num_blocks, max_blocks); + + auto sort_kernel = + vllm::moe::count_and_sort_expert_tokens_kernel; + sort_kernel<<>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + cumsum_buffer.data_ptr(), topk_ids.numel()); + } + }); +} + +void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] + torch::Tensor& output) // [num_tokens, hidden_size] +{ + const int hidden_size = input.size(-1); + const auto num_tokens = output.numel() / hidden_size; + const int topk = input.size(1); + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(output)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (topk) { + case 2: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + case 3: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + case 4: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + default: + at::sum_out(output, input, 1); + break; + } +} diff --git a/vllm_v0.10.0/csrc/moe/moe_ops.h b/vllm_v0.10.0/csrc/moe/moe_ops.h new file mode 100644 index 0000000..661730c --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/moe_ops.h @@ -0,0 +1,31 @@ +#pragma once + +#include + +void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, + torch::Tensor& token_expert_indices, + torch::Tensor& gating_output); + 
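+// The moe_sum kernel defined in moe_align_sum_kernels.cu above reduces over
+// the topk dimension: out[t][d] = sum_k input[t][k][d], with template
+// specializations for topk in {2, 3, 4} and an at::sum_out fallback for other
+// values. A scalar CPU reference for illustration only (the function name and
+// float-only signature are assumptions, not part of this header):
+inline void moe_sum_ref(const float* input, float* out, int num_tokens,
+                        int topk, int hidden_size) {
+  for (int t = 0; t < num_tokens; ++t)
+    for (int d = 0; d < hidden_size; ++d) {
+      float acc = 0.0f;
+      for (int k = 0; k < topk; ++k)
+        acc += input[(t * topk + k) * hidden_size + d];
+      out[t * hidden_size + d] = acc;
+    }
+}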
+void moe_sum(torch::Tensor& input, torch::Tensor& output); + +void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); +#ifndef USE_ROCM +torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, + torch::Tensor b_qweight, torch::Tensor b_scales, + std::optional b_qzeros, + std::optional topk_weights, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, int64_t top_k, + int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, + int64_t BLOCK_SIZE_K, int64_t bit); +#endif + +bool moe_permute_unpermute_supported(); + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor); \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/moe/moe_permute_unpermute_op.cu b/vllm_v0.10.0/csrc/moe/moe_permute_unpermute_op.cu new file mode 100644 index 0000000..a77471a --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/moe_permute_unpermute_op.cu @@ -0,0 +1,230 @@ +#include +#include +#include +#include "permute_unpermute_kernels/moe_permute_unpermute_kernel.h" +#include "permute_unpermute_kernels/dispatch.h" +#include "core/registration.h" + +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + +void moe_permute( + const torch::Tensor& input, // [n_token, hidden] + const torch::Tensor& topk_weights, //[n_token, topk] + torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& token_expert_indices, // [n_token, topk] + const std::optional& expert_map, // [n_expert] + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& + permuted_input, // [topk * n_token/align_block_size_m, hidden] + torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] + torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + torch::Tensor& m_indices) { // [align_expand_m] + TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float, + "topk_weights must be float32"); + TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, + "expert_first_token_offset must be int64"); + TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, + "topk_ids must be int32"); + TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, + "token_expert_indices must be int32"); + TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, + "src_row_id2dst_row_id_map must be int32"); + TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, + "expert_first_token_offset shape != n_local_expert+1") + TORCH_CHECK( + src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as src_row_id2dst_row_id_map"); + auto n_token = input.sizes()[0]; + auto n_hidden = input.sizes()[1]; + auto align_block_size_value = + align_block_size.has_value() ? 
align_block_size.value() : -1; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const long sorter_size = + CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert); + auto sort_workspace = torch::empty( + {sorter_size}, + torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); + auto permuted_experts_id = torch::empty_like(topk_ids); + auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + + CubKeyValueSorter sorter{}; + int64_t* valid_num_ptr = nullptr; + // pre-process kernel for expert-parallelism: + // no local expert id plus "n_expert" offset for priority to local expert + // map local expert id [n, .., n+n_local_expert-1] to [0, n_local_expert -1] + // For example, 4 expert with ep_size=2. ep_rank=1 owns global expert id + // [2,3] with expert_map[-1, -1, 0, 1], preprocess_topk_id process topk_ids + // and map global expert id [2, 3] to local_expert id [0, 1] and map global + // expert id [0, 1] ( not in ep rank=1) to [4, 5] by plus n_expert. This map + // operation is to make local expert high priority in following sort topk_ids + // and scan local expert_first_token_offset for each ep rank for next group + // gemm. + if (expert_map.has_value()) { + const int* expert_map_ptr = get_ptr(expert_map.value()); + valid_num_ptr = + get_ptr(expert_first_token_offset) + n_local_expert; + preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk, + expert_map_ptr, n_expert, stream); + } + // expert sort topk expert id and scan expert id get expert_first_token_offset + sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), + get_ptr(permuted_experts_id), + get_ptr(dst_row_id2src_row_id_map), + get_ptr(expert_first_token_offset), n_token, + n_expert, n_local_expert, topk, sorter, + get_ptr(sort_workspace), stream); + + // dispatch expandInputRowsKernelLauncher + MOE_DISPATCH(input.scalar_type(), [&] { + expandInputRowsKernelLauncher( + get_ptr(input), get_ptr(permuted_input), + get_ptr(topk_weights), get_ptr(permuted_experts_id), + get_ptr(dst_row_id2src_row_id_map), + get_ptr(src_row_id2dst_row_id_map), + get_ptr(expert_first_token_offset), n_token, valid_num_ptr, + n_hidden, topk, n_local_expert, align_block_size_value, stream); + }); + + // get m_indices and update expert_first_token_offset with align block + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); + if (align_block_size.has_value()) { + // update align_expert_first_token_offset + expert_first_token_offset.copy_(align_expert_first_token_offset); + } +} + +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] + const torch::Tensor& topk_weights, //[n_token, topk] + const torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] + int64_t n_expert, int64_t n_local_expert, int64_t topk, + torch::Tensor& hidden_states // [n_token, hidden] +) { + TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(), + "topk_ids shape must be same as src_row_id2dst_row_id_map"); + TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, + "topk_ids must be int32"); + TORCH_CHECK( + permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), + "topk_ids dtype must be same as 
src_row_id2dst_row_id_map"); + auto n_token = hidden_states.size(0); + auto n_hidden = hidden_states.size(1); + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int64_t* valid_ptr = + get_ptr(expert_first_token_offset) + n_local_expert; + MOE_DISPATCH(hidden_states.scalar_type(), [&] { + finalizeMoeRoutingKernelLauncher( + get_ptr(permuted_hidden_states), + get_ptr(hidden_states), get_ptr(topk_weights), + get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids), + n_token, n_hidden, topk, valid_ptr, stream); + }); +} + +template +__global__ void shuffleInputRowsKernel(const T* input, + const int32_t* dst2src_map, T* output, + int64_t num_src_rows, + int64_t num_dst_rows, int64_t num_cols) { + int64_t dest_row_idx = blockIdx.x; + int64_t const source_row_idx = dst2src_map[dest_row_idx]; + + if (blockIdx.x < num_dst_rows) { + // Load 128-bits per thread + constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8; + using DataElem = cutlass::Array; + + // Duplicate and permute rows + auto const* source_row_ptr = + reinterpret_cast(input + source_row_idx * num_cols); + auto* dest_row_ptr = + reinterpret_cast(output + dest_row_idx * num_cols); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD; + + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + dest_row_ptr[elem_index] = source_row_ptr[elem_index]; + } + } +} + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor) { + TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(), + "Input and output tensors must have the same data type"); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + int64_t const blocks = output_tensor.size(0); + int64_t const threads = 256; + int64_t const num_dest_rows = output_tensor.size(0); + int64_t const num_src_rows = input_tensor.size(0); + int64_t const num_cols = input_tensor.size(1); + + TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)), + "num_cols must be divisible by 128 / " + "sizeof(input_tensor.scalar_type()) / 8"); + + MOE_DISPATCH(input_tensor.scalar_type(), [&] { + shuffleInputRowsKernel<<>>( + reinterpret_cast(input_tensor.data_ptr()), + dst2src_map.data_ptr(), + reinterpret_cast(output_tensor.data_ptr()), num_src_rows, + num_dest_rows, num_cols); + }); +} + +#else + +void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, + torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indices, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +void moe_unpermute(const torch::Tensor& input, + const torch::Tensor& topk_weights, torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indices, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +#endif + +bool moe_permute_unpermute_supported() { +#if 
defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + return true; +#else + return false; +#endif +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("moe_permute", &moe_permute); + m.impl("moe_unpermute", &moe_unpermute); +} diff --git a/vllm_v0.10.0/csrc/moe/moe_wna16.cu b/vllm_v0.10.0/csrc/moe/moe_wna16.cu new file mode 100644 index 0000000..7b6a111 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/moe_wna16.cu @@ -0,0 +1,342 @@ + +#include +#include +#include +#include + +#include +#include +#include "moe_wna16_utils.h" + +#define DIVIDE(x, size) (((x) + (size) - 1) / (size)) + +template +__global__ void moe_wna16_gemm_kernel( + const scalar_t* __restrict__ input, scalar_t* __restrict__ output, + const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales, + const uint32_t* __restrict__ qzeros, + + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_token_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ num_tokens_post_pad, + + uint16_t num_experts, uint16_t group_size, uint16_t top_k, uint32_t size_m, + uint32_t size_n, uint32_t size_k, uint16_t BLOCK_SIZE_M, + uint16_t BLOCK_SIZE_N, uint16_t BLOCK_SIZE_K, bool has_zp, + bool mul_topk_weight) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800 + if constexpr (std::is_same::value) { + return; + } else { +#endif + + using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + + if (blockIdx.x * BLOCK_SIZE_M >= num_tokens_post_pad[0]) return; + + const int32_t offset_n = blockIdx.y * BLOCK_SIZE_N + threadIdx.x; + const int32_t offset_k = blockIdx.z * BLOCK_SIZE_K; + + const int32_t expert_id = expert_ids[blockIdx.x]; + + int32_t num_valid_tokens = 0; + extern __shared__ uint16_t block_input_tmp[]; + scalar_t* block_input = reinterpret_cast(block_input_tmp); + scalar_t2* block_input_half2 = reinterpret_cast(block_input); + + // load BLOCK_SIZE_M * BLOCK_SIZE_K into shared memory + for (int m = 0; m < BLOCK_SIZE_M; m++) { + const int32_t offset_m = blockIdx.x * BLOCK_SIZE_M + m; + const int32_t token_index = sorted_token_ids[offset_m]; + if (token_index / top_k >= size_m) break; + + num_valid_tokens = m + 1; + + if (expert_id != -1) { + int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N); + for (int i = 0; i < k_per_thread; i++) { + int k = BLOCK_SIZE_N * i + threadIdx.x; + if (k >= BLOCK_SIZE_K) break; + if (offset_k + k >= size_k) break; + + // load input to shared memory + // use a special layout to fit the layout of dequanted-weight + int origin_k; + if constexpr (bit == 4) { + // [0, 4, 1, 5, 2, 6, 3, 7] + int8_t order = (threadIdx.x % 2) * 4 + ((threadIdx.x % 8) / 2); + origin_k = BLOCK_SIZE_N * i + threadIdx.x / 8 * 8 + order; + } else { + // [0, 2, 1, 3] + int8_t order = (threadIdx.x % 2) * 2 + ((threadIdx.x % 4) / 2); + origin_k = BLOCK_SIZE_N * i + threadIdx.x / 4 * 4 + order; + } + + origin_k += token_index / top_k * size_k + blockIdx.z * BLOCK_SIZE_K; + block_input[m * BLOCK_SIZE_K + k] = input[origin_k]; + } + } + } + + if (expert_id == -1) return; + __syncthreads(); + if (threadIdx.x >= BLOCK_SIZE_N || offset_n >= size_n) return; + + float res[64]; // assume BLOCK_SIZE_M <= 64 + scalar_t2 res2; + scalar_t2 scale_f2; + scalar_t2 qzero_f2; + + // note that (size_n * size_k * expert_id) may greater than 2 ** 31 + constexpr int8_t pack_factor = 32 / bit; + const uint64_t expert_offset = ((uint64_t)size_n) * size_k * expert_id; + const uint32_t* expert_qweight = qweight + expert_offset / pack_factor; + const scalar_t* expert_scales 
= scales + expert_offset / group_size; + const uint32_t* expert_qzeros = + qzeros + expert_offset / group_size / pack_factor; + + // load 4*int32 one time: 4 int32 = 128 bit = 1 float4 + // weight would be loaded in loop + uint32_t expert_qweight_tmp[4]; + float4* expert_qweight_tmp_float4 = + reinterpret_cast(expert_qweight_tmp); + + // load all required scales one time + scalar_t expert_scales_groups[GROUPS]; + int scales_offset_tmp = + (offset_n * size_k + offset_k) / group_size / GROUPS; + if constexpr (GROUPS == 1) { + *expert_scales_groups = expert_scales[scales_offset_tmp]; + } else if constexpr (GROUPS == 2) { + float* expert_scales_groups_tmp = + reinterpret_cast(expert_scales_groups); + *expert_scales_groups_tmp = + reinterpret_cast(expert_scales)[scales_offset_tmp]; + } else if constexpr (GROUPS == 4) { + float2* expert_scales_groups_tmp = + reinterpret_cast(expert_scales_groups); + *expert_scales_groups_tmp = + reinterpret_cast(expert_scales)[scales_offset_tmp]; + } else if constexpr (GROUPS == 8) { + float4* expert_scales_groups_tmp = + reinterpret_cast(expert_scales_groups); + *expert_scales_groups_tmp = + reinterpret_cast(expert_scales)[scales_offset_tmp]; + } + + // load all required qzeros one time + uint8_t expert_qzeros_groups[GROUPS]; + if (!has_zp) { + if constexpr (bit == 4) { + qzero_f2 = Dtype::num2num2(Dtype::int2num(8)); + } else { + qzero_f2 = Dtype::num2num2(Dtype::int2num(128)); + } + } else { + int qzeros_offset_tmp = + (offset_n / (8 / bit)) * (size_k / group_size / GROUPS) + + offset_k / group_size / GROUPS; + if constexpr (GROUPS == 1) { + uint8_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } else if constexpr (GROUPS == 2) { + uint16_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } else if constexpr (GROUPS == 4) { + uint32_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } else if constexpr (GROUPS == 8) { + uint64_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } + } + + for (int tmp_k = 0; tmp_k < BLOCK_SIZE_K / pack_factor; tmp_k++) { + int k = offset_k + tmp_k * pack_factor; + if (k >= size_k) break; + const int32_t weight_offset = offset_n * size_k + k; + + if (tmp_k % 4 == 0) { + *expert_qweight_tmp_float4 = reinterpret_cast( + expert_qweight)[weight_offset / pack_factor / 4]; + } + + if (tmp_k % (group_size / pack_factor) == 0) { + scalar_t scale_f = + expert_scales_groups[tmp_k / (group_size / pack_factor)]; + scale_f2 = Dtype::num2num2(scale_f); + + if (has_zp) { + uint8_t qzero = + expert_qzeros_groups[tmp_k / (group_size / pack_factor)]; + if constexpr (bit == 4) { + qzero = (qzero >> ((threadIdx.x % 2) * 4)) & 0xF; + } + qzero_f2 = Dtype::num2num2(Dtype::int2num(qzero)); + } + } + + scalar_t2 weight_half2[16 / bit]; + dequant(expert_qweight_tmp[tmp_k % 4], weight_half2); + + for (int m = 0; m < num_valid_tokens; m++) { + res2 = {}; + +#pragma unroll + for (int i = 0; i < 16 / bit; i++) { + int32_t offset_input = m * BLOCK_SIZE_K / 2 + tmp_k * (16 / bit) + i; + res2 = __hfma2(__hmul2(__hsub2(weight_half2[i], qzero_f2), scale_f2), + block_input_half2[offset_input], res2); + } + + if (tmp_k == 0) { + res[m] = 
Dtype::num2float(res2.x) + Dtype::num2float(res2.y); + } else { + res[m] += Dtype::num2float(res2.x) + Dtype::num2float(res2.y); + } + } + } + + for (int m = 0; m < num_valid_tokens; ++m) { + const int32_t token_index = + sorted_token_ids[blockIdx.x * BLOCK_SIZE_M + m]; + if (mul_topk_weight) { + res[m] *= topk_weights[token_index]; + } + atomicAdd(&output[token_index * size_n + offset_n], + Dtype::float2num(res[m])); + } + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800 + } +#endif +} + +template +void run_moe_wna16_gemm(const scalar_t* input, scalar_t* output, + const uint32_t* b_qweight, const scalar_t* b_scales, + const uint32_t* b_qzeros, const float* topk_weights, + const int32_t* sorted_token_ids, + const int32_t* expert_ids, + const int32_t* num_tokens_post_pad, int num_experts, + int group_size, int num_token_blocks, int top_k, + int size_m, int size_n, int size_k, int BLOCK_SIZE_M, + int BLOCK_SIZE_N, int BLOCK_SIZE_K, int bit, + bool has_zp, bool mul_topk_weight) { + dim3 blockDim, gridDim; + blockDim.x = BLOCK_SIZE_N; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = num_token_blocks; + gridDim.y = DIVIDE(size_n, BLOCK_SIZE_N); + gridDim.z = DIVIDE(size_k, BLOCK_SIZE_K); + + auto kernel = moe_wna16_gemm_kernel; + if (bit == 4) { + if (BLOCK_SIZE_K / group_size == 2) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 4) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 8) { + kernel = moe_wna16_gemm_kernel; + } + } else { + if (BLOCK_SIZE_K / group_size == 1) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 2) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 4) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 8) { + kernel = moe_wna16_gemm_kernel; + } + } + + const int shared_mem_size = BLOCK_SIZE_M * BLOCK_SIZE_K * 2; + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>>( + input, output, b_qweight, b_scales, b_qzeros, topk_weights, + sorted_token_ids, expert_ids, num_tokens_post_pad, num_experts, + group_size, top_k, size_m, size_n, size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, has_zp, mul_topk_weight); +} + +torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, + torch::Tensor b_qweight, torch::Tensor b_scales, + std::optional b_qzeros, + std::optional topk_weights, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, int64_t top_k, + int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, + int64_t BLOCK_SIZE_K, int64_t bit) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + output.zero_(); + + const int num_experts = b_qweight.size(0); + const int size_m = input.size(0); + const int size_n = b_qweight.size(1); + const int size_k = input.size(1); + const int group_size = size_k / b_scales.size(2); + + int64_t EM = sorted_token_ids.size(0); + if (size_m <= BLOCK_SIZE_M) { + EM = min(EM, size_m * BLOCK_SIZE_M * top_k); + } + const int num_token_blocks = (EM + BLOCK_SIZE_M - 1) / BLOCK_SIZE_M; + + const uint32_t* b_qzeros_ptr; + if (b_qzeros.has_value()) + b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr(); + const float* topk_weights_ptr = nullptr; + if (topk_weights.has_value()) + topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); + + int groups_per_block_row = BLOCK_SIZE_K / group_size; + TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8"); + TORCH_CHECK(size_k % BLOCK_SIZE_K == 0, + "size_k must 
divisible by BLOCK_SIZE_K"); + TORCH_CHECK(BLOCK_SIZE_K % group_size == 0, + "BLOCK_SIZE_K must divisible by group_size"); + TORCH_CHECK(BLOCK_SIZE_M <= 64, "BLOCK_SIZE_M must less or equal to 64"); + TORCH_CHECK(groups_per_block_row == 1 || groups_per_block_row == 2 || + groups_per_block_row == 4 || groups_per_block_row == 8, + "BLOCK_SIZE_K // group_size must be one of [1, 2, 4, 8]"); + + if (input.scalar_type() == at::ScalarType::Half) { + run_moe_wna16_gemm( + (const half*)input.data_ptr(), + (half*)output.data_ptr(), + (const uint32_t*)b_qweight.data_ptr(), + (const half*)b_scales.data_ptr(), b_qzeros_ptr, + topk_weights_ptr, sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_post_pad.data_ptr(), + num_experts, group_size, num_token_blocks, top_k, size_m, size_n, + size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, bit, + b_qzeros.has_value(), topk_weights.has_value()); + } else if (input.scalar_type() == at::ScalarType::BFloat16) { + run_moe_wna16_gemm( + (const nv_bfloat16*)input.data_ptr(), + (nv_bfloat16*)output.data_ptr(), + (const uint32_t*)b_qweight.data_ptr(), + (const nv_bfloat16*)b_scales.data_ptr(), b_qzeros_ptr, + topk_weights_ptr, sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_post_pad.data_ptr(), + num_experts, group_size, num_token_blocks, top_k, size_m, size_n, + size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, bit, + b_qzeros.has_value(), topk_weights.has_value()); + } else { + TORCH_CHECK(false, "moe_wna16_gemm only supports bfloat16 and float16"); + } + return output; +} diff --git a/vllm_v0.10.0/csrc/moe/moe_wna16_utils.h b/vllm_v0.10.0/csrc/moe/moe_wna16_utils.h new file mode 100644 index 0000000..8ef03f0 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/moe_wna16_utils.h @@ -0,0 +1,200 @@ + +#include +#include + +template +class ScalarType {}; + +template <> +class ScalarType { + public: + using scalar_t = half; + using scalar_t2 = half2; + + static __device__ float inline num2float(const half x) { + return __half2float(x); + } + + static __device__ half2 inline num2num2(const half x) { + return __half2half2(x); + } + + static __device__ half2 inline nums2num2(const half x1, const half x2) { + return __halves2half2(x1, x2); + } + + static __host__ __device__ half inline float2num(const float x) { + return __float2half(x); + } + + static __host__ __device__ half inline int2num(const float x) { + return __int2half_rn(x); + } + + static __host__ __device__ float2 inline num22float2(const half2 x) { + return __half22float2(x); + } + + static __host__ __device__ half2 inline float22num2(const float2 x) { + return __float22half2_rn(x); + } +}; + +template <> +class ScalarType { + public: + using scalar_t = nv_bfloat16; + using scalar_t2 = nv_bfloat162; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static __device__ float inline num2float(const nv_bfloat16 x) { + return __bfloat162float(x); + } + + static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) { + return __bfloat162bfloat162(x); + } + + static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1, + const nv_bfloat16 x2) { + return __halves2bfloat162(x1, x2); + } + + static __host__ __device__ nv_bfloat16 inline float2num(const float x) { + return __float2bfloat16(x); + } + + static __host__ __device__ nv_bfloat16 inline int2num(const float x) { + return __int2bfloat16_rn(x); + } + + static __host__ __device__ float2 inline num22float2(const nv_bfloat162 x) { + return __bfloat1622float2(x); + } + + static __host__ __device__ nv_bfloat162 inline 
float22num2(const float2 x) { + return __float22bfloat162_rn(x); + } +#endif +}; + +template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +template +__device__ inline void dequant(int q, scalar_t2* res) {} + +template <> +__device__ inline void dequant(int q, half2* res) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + const int SUB = 0x64006400; + const int MUL = 0x2c002c00; + const int ADD = 0xd400d400; + + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + q >>= 8; + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + + res[0] = __hsub2(*reinterpret_cast(&lo0), + *reinterpret_cast(&SUB)); + res[1] = __hfma2(*reinterpret_cast(&hi0), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[2] = __hsub2(*reinterpret_cast(&lo1), + *reinterpret_cast(&SUB)); + res[3] = __hfma2(*reinterpret_cast(&hi1), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); +} + +template <> +__device__ inline void dequant(int q, half2* res) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400; + + res[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + res[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +template <> +__device__ inline void dequant(int q, nv_bfloat162* res) { + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t EX = 0x43004300; + + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + q >>= 4; + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + q >>= 4; + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + q >>= 4; + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + + static constexpr uint32_t MUL = 0x3F803F80; + static constexpr uint32_t ADD = 0xC300C300; + + res[0] = __hfma2(*reinterpret_cast(&lo0), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[1] = __hfma2(*reinterpret_cast(&hi0), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[2] = __hfma2(*reinterpret_cast(&lo1), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[3] = __hfma2(*reinterpret_cast(&hi1), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); +} + +template <> +__device__ inline void dequant(int q, nv_bfloat162* res) { + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted = + reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388608.f; + fp32_intermediates[1] -= 8388608.f; + fp32_intermediates[2] -= 8388608.f; + fp32_intermediates[3] -= 8388608.f; + + uint32_t* bf16_result_ptr = 
reinterpret_cast(res); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], + fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], + fp32_intermediates_casted[3], 0x7632); +} +#endif diff --git a/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/dispatch.h b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/dispatch.h new file mode 100644 index 0000000..d0f1ea4 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/dispatch.h @@ -0,0 +1,59 @@ +#pragma once +#include +#define MOE_SWITCH(TYPE, ...) \ + at::ScalarType _st = ::detail::scalar_type(TYPE); \ + switch (_st) { \ + __VA_ARGS__ \ + default: \ + TORCH_CHECK(false, "[moe permute]data type dispatch fail!") \ + } + +#define MOE_DISPATCH_CASE(enum_type, ...) \ + case enum_type: { \ + using scalar_t = ScalarType2CudaType::type; \ + __VA_ARGS__(); \ + break; \ + } +#define MOE_DISPATCH_FLOAT_CASE(...) \ + MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) + +#define MOE_DISPATCH(TYPE, ...) \ + MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__)) + +template +struct ScalarType2CudaType; + +template <> +struct ScalarType2CudaType { + using type = float; +}; +template <> +struct ScalarType2CudaType { + using type = half; +}; +template <> +struct ScalarType2CudaType { + using type = __nv_bfloat16; +}; +// uint8 for packed fp4 +template <> +struct ScalarType2CudaType { + using type = uint8_t; +}; + +// #if __CUDA_ARCH__ >= 890 +// fp8 +template <> +struct ScalarType2CudaType { + using type = __nv_fp8_e5m2; +}; +template <> +struct ScalarType2CudaType { + using type = __nv_fp8_e4m3; +}; +// #endif \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu new file mode 100644 index 0000000..de2c153 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -0,0 +1,231 @@ + +#include "moe_permute_unpermute_kernel.h" + +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + +// CubKeyValueSorter definition begin +CubKeyValueSorter::CubKeyValueSorter() + : num_experts_(0), num_bits_(sizeof(int) * 8) {} + +int CubKeyValueSorter::expertsToBits(int num_experts) { + // Max value we represent is V = num_experts + (num_experts - 1) = 2 * + // num_experts - 1 The maximum number of bits is therefore floor(log2(V)) + 1 + return static_cast(log2(2 * num_experts - 1)) + 1; +} + +CubKeyValueSorter::CubKeyValueSorter(int const num_experts) + : num_experts_(num_experts), num_bits_(expertsToBits(num_experts)) {} + +void CubKeyValueSorter::updateNumExperts(int const num_experts) { + num_experts_ = num_experts; + num_bits_ = expertsToBits(num_experts); +} + +size_t CubKeyValueSorter::getWorkspaceSize(size_t const num_key_value_pairs, + int const num_experts) { + int num_bits = expertsToBits(num_experts); + size_t required_storage = 0; + int* null_int = nullptr; + cub::DeviceRadixSort::SortPairs(nullptr, required_storage, null_int, null_int, + null_int, null_int, num_key_value_pairs, 0, + num_bits); + + // when num_key_value_pairs, num_experts, num_bits, 
required_storage = 64, + // 4, 3, 0 The required_storage seems to vary between 0 and 1 for the same + // inputs + if (required_storage == 0) { + required_storage = 1; + } + return required_storage; +} + +void CubKeyValueSorter::run(void* workspace, size_t const workspace_size, + int const* keys_in, int* keys_out, + int const* values_in, int* values_out, + size_t const num_key_value_pairs, + cudaStream_t stream) { + size_t expected_ws_size = getWorkspaceSize(num_key_value_pairs, num_experts_); + size_t actual_ws_size = workspace_size; + + TORCH_CHECK(expected_ws_size <= workspace_size, + "[CubKeyValueSorter::run] The allocated workspace is too small " + "to run this problem."); + cub::DeviceRadixSort::SortPairs(workspace, actual_ws_size, keys_in, keys_out, + values_in, values_out, num_key_value_pairs, 0, + num_bits_, stream); +} +// CubKeyValueSorter definition end + +static inline size_t pad_to_multiple_of_16(size_t const& input) { + static constexpr int ALIGNMENT = 16; + return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); +} +template +__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, + int64_t const arr_length, + T const target) { + int64_t low = 0, high = arr_length - 1, target_location = -1; + while (low <= high) { + int64_t mid = (low + high) / 2; + + if (sorted_indices[mid] >= target) { + high = mid - 1; + } else { + low = mid + 1; + target_location = mid; + } + } + return target_location + 1; +} + +// Calculates the start offset of the tokens for a given expert. The last +// element is the total number of valid tokens +__global__ void computeExpertFirstTokenOffsetKernel( + int const* sorted_experts, int64_t const sorted_experts_len, + int const num_experts, int64_t* expert_first_token_offset) { + // First, compute the global tid. We only need 1 thread per expert. + int const expert = blockIdx.x * blockDim.x + threadIdx.x; + + // Note that expert goes [0, num_experts] (inclusive) because we want a count + // for the total number of active tokens at the end of the scan. 
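+// Worked example: with sorted_experts = {0, 0, 1, 1, 1, 3} and num_experts =
+// 4, this kernel writes expert_first_token_offset = {0, 2, 5, 5, 6}. Each
+// entry counts the sorted ids strictly less than its expert index (the same
+// count std::lower_bound would give), so offset[e + 1] - offset[e] is expert
+// e's token count and offset[num_experts] is the total number of valid
+// tokens.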
+ if (expert >= num_experts + 1) { + return; + } + expert_first_token_offset[expert] = + findTotalEltsLessThanTarget(sorted_experts, sorted_experts_len, expert); +} + +void computeExpertFirstTokenOffset(int const* sorted_indices, + int const total_indices, + int const num_experts, + int64_t* expert_first_token_offset, + cudaStream_t stream) { + int const num_entries = num_experts + 1; + int const threads = std::min(1024, num_entries); + int const blocks = (num_entries + threads - 1) / threads; + + computeExpertFirstTokenOffsetKernel<<>>( + sorted_indices, total_indices, num_experts, expert_first_token_offset); +} + +void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, + int* permuted_experts, int* permuted_rows, + int64_t* expert_first_token_offset, int num_rows, + int num_experts, int num_experts_per_node, int k, + CubKeyValueSorter& sorter, void* sorter_ws, + cudaStream_t stream) { + int64_t const expanded_num_rows = static_cast(k) * num_rows; + // We need to use the full num_experts because that is the sentinel value used + // by topk for disabled experts + sorter.updateNumExperts(num_experts); + size_t const sorter_ws_size_bytes = pad_to_multiple_of_16( + sorter.getWorkspaceSize(expanded_num_rows, num_experts)); + sorter.run((void*)sorter_ws, sorter_ws_size_bytes, expert_for_source_row, + permuted_experts, source_rows, permuted_rows, expanded_num_rows, + stream); + computeExpertFirstTokenOffset(permuted_experts, expanded_num_rows, + num_experts_per_node, expert_first_token_offset, + stream); +} + +__global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, + const int* expert_map_ptr, + int num_experts) { + auto tidx = threadIdx.x; + auto bidx = blockIdx.x; + auto offset = bidx * blockDim.x; + auto bound = min(offset + blockDim.x, size); + extern __shared__ int smem_expert_map[]; + // store expert_map in smem + for (int i = tidx; i < num_experts; i += blockDim.x) { + smem_expert_map[i] = expert_map_ptr[i]; + } + __syncthreads(); + + // query global expert id in expert map. 
+ // if global expert id = -1 in exert map, plus n_expert + // else set global expert id = exert map[global expert id] + if (offset + tidx < bound) { + auto topk_id = topk_id_ptr[offset + tidx]; + auto local_expert_idx = smem_expert_map[topk_id]; + if (local_expert_idx == -1) { + topk_id += num_experts; + } else { + topk_id = local_expert_idx; + } + __syncwarp(); + topk_id_ptr[offset + tidx] = topk_id; + } +} +void preprocessTopkIdLauncher(int* topk_id_ptr, int size, + const int* expert_map_ptr, int num_experts, + cudaStream_t stream) { + int block = std::min(size, 1024); + int grid = (size + block - 1) / block; + int smem_size = (num_experts) * sizeof(int); + preprocessTopkIdKernel<<>>( + topk_id_ptr, size, expert_map_ptr, num_experts); +} + +template +__global__ void getMIndicesKernel(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, + int* m_indices, const int num_local_expert, + const int align_block_size) { + int eidx = blockIdx.x; + int tidx = threadIdx.x; + extern __shared__ int64_t smem_expert_first_token_offset[]; + for (int i = tidx; i <= num_local_expert; i += blockDim.x) { + smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); + } + __syncthreads(); + auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; + auto first_token_offset = smem_expert_first_token_offset[eidx]; + int n_token_in_expert = last_token_offset - first_token_offset; + + if constexpr (ALIGN_BLOCK_SIZE) { + n_token_in_expert = (n_token_in_expert + align_block_size - 1) / + align_block_size * align_block_size; + // round up to ALIGN_BLOCK_SIZE + int64_t accumulate_align_offset = 0; + for (int i = 1; i <= eidx + 1; i++) { + int n_token = smem_expert_first_token_offset[i] - + smem_expert_first_token_offset[i - 1]; + accumulate_align_offset = + accumulate_align_offset + (n_token + align_block_size - 1) / + align_block_size * align_block_size; + if (i == eidx) { + first_token_offset = accumulate_align_offset; + } + // last block store align_expert_first_token_offset + if (eidx == num_local_expert - 1 && threadIdx.x == 0) { + align_expert_first_token_offset[i] = accumulate_align_offset; + } + } + } + for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) { + // update m_indice with expert id + m_indices[first_token_offset + idx] = eidx; + } +} + +void getMIndices(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, int* m_indices, + int num_local_expert, const int align_block_size, + cudaStream_t stream) { + int block = 256; + int grid = num_local_expert; + int smem_size = sizeof(int64_t) * (num_local_expert + 1); + if (align_block_size == -1) { + getMIndicesKernel<<>>( + expert_first_token_offset, align_expert_first_token_offset, m_indices, + num_local_expert, align_block_size); + } else { + getMIndicesKernel<<>>( + expert_first_token_offset, align_expert_first_token_offset, m_indices, + num_local_expert, align_block_size); + } +} + +#endif diff --git a/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h new file mode 100644 index 0000000..43c2972 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h @@ -0,0 +1,95 @@ +#pragma once +// reference from tensorrt_llm moe kernel implementation archive in +// https://github.com/BBuf/tensorrt-llm-moe/tree/master + +#include +#include +#include "dispatch.h" +#include +#include +#include +#include "cutlass/numeric_size.h" 
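+// Worked example for getMIndicesKernel / getMIndices defined in
+// moe_permute_unpermute_kernel.cu above (values are illustrative): with
+// expert_first_token_offset = {0, 3, 3, 8} (token counts 3, 0, 5 for three
+// local experts) and align_block_size = 4, each expert's segment is rounded
+// up to {4, 0, 8}, align_expert_first_token_offset becomes {0, 4, 4, 12}, and
+// m_indices = {0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2}: every row of the aligned
+// grouped-GEMM layout, including padding rows, is tagged with the expert that
+// owns it.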
+#include "cutlass/array.h" + +template +inline T* get_ptr(torch::Tensor& t) { + return reinterpret_cast(t.data_ptr()); +} + +template +inline const T* get_ptr(const torch::Tensor& t) { + return reinterpret_cast(t.data_ptr()); +} + +class CubKeyValueSorter { + public: + CubKeyValueSorter(); + + CubKeyValueSorter(int const num_experts); + + void updateNumExperts(int const num_experts); + + static size_t getWorkspaceSize(size_t const num_key_value_pairs, + int const num_experts); + + void run(void* workspace, size_t const workspace_size, int const* keys_in, + int* keys_out, int const* values_in, int* values_out, + size_t const num_key_value_pairs, cudaStream_t stream); + + private: + static int expertsToBits(int experts); + int num_experts_; + int num_bits_; +}; + +void computeExpertFirstTokenOffset(int const* sorted_indices, + int const total_indices, + int const num_experts, + int64_t* expert_first_token_offset, + cudaStream_t stream); + +void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, + int* permuted_experts, int* permuted_rows, + int64_t* expert_first_token_offset, int num_rows, + int num_experts, int num_experts_per_node, int k, + CubKeyValueSorter& sorter, void* sorter_ws, + cudaStream_t stream); + +template +void expandInputRowsKernelLauncher( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, + int num_local_experts, const int& align_block_size, cudaStream_t stream); + +// Final kernel to unpermute and scale +// This kernel unpermutes the original data, does the k-way reduction and +// performs the final skip connection. 
+template +__global__ void finalizeMoeRoutingKernel( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, + int64_t const* num_valid_ptr); + +template +void finalizeMoeRoutingKernelLauncher( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, + cudaStream_t stream); + +void preprocessTopkIdLauncher(int* topk_id_ptr, int size, + const int* expert_map_ptr, int num_experts, + cudaStream_t stream); + +void getMIndices(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, int* m_indices, + int num_local_expert, const int align_block_size, + cudaStream_t stream); + +#include "moe_permute_unpermute_kernel.inl" diff --git a/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl new file mode 100644 index 0000000..ad0d390 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -0,0 +1,206 @@ +#pragma once + +template +__global__ void expandInputRowsKernel( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_dest_rows, int64_t const cols, int64_t k, + int num_local_experts, int align_block_size) { + // Reverse permutation map. + // I do this so that later, we can use the source -> dest map to do the k-way + // reduction and unpermuting. I need the reverse map for that reduction to + // allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1 + // thread block will be responsible for all k summations. + int64_t expanded_dest_row = blockIdx.x; + int64_t const expanded_source_row = + expanded_dest_row_to_expanded_source_row[expanded_dest_row]; + int expert_id = sorted_experts[expanded_dest_row]; + + extern __shared__ int64_t smem_expert_first_token_offset[]; + if constexpr (ALIGN_BLOCK_SIZE) { + // load g2s + for (int idx = threadIdx.x; idx < num_local_experts + 1; + idx += blockDim.x) { + smem_expert_first_token_offset[idx] = + __ldg(expert_first_token_offset + idx); + } + __syncthreads(); + int lane_idx = threadIdx.x & 31; + + if (lane_idx == 0) { + // set token_offset_in_expert = 0 if this expert is not local expert + int token_offset_in_expert = + expert_id >= num_local_experts + ? 
0 + : expanded_dest_row - smem_expert_first_token_offset[expert_id]; + int64_t accumulate_align_offset = 0; +#pragma unroll 1 + for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) { + auto n_token_in_expert = smem_expert_first_token_offset[eidx] - + smem_expert_first_token_offset[eidx - 1]; + accumulate_align_offset += (n_token_in_expert + align_block_size - 1) / + align_block_size * align_block_size; + } + expanded_dest_row = accumulate_align_offset + token_offset_in_expert; + } + // lane0 shuffle broadcast align_expanded_dest_row + expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0); + } + + if (threadIdx.x == 0) { + assert(expanded_dest_row <= INT32_MAX); + expanded_source_row_to_expanded_dest_row[expanded_source_row] = + static_cast(expanded_dest_row); + } + + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { + // Load 128-bits per thread + constexpr int64_t ELEM_PER_THREAD = 128 / cutlass::sizeof_bits::value; + using DataElem = cutlass::Array; + + // Duplicate and permute rows + int64_t const source_row = expanded_source_row % num_rows; + + auto const* source_row_ptr = + reinterpret_cast(unpermuted_input + source_row * cols); + auto* dest_row_ptr = + reinterpret_cast(permuted_output + expanded_dest_row * cols); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = cols / ELEM_PER_THREAD; + + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + dest_row_ptr[elem_index] = source_row_ptr[elem_index]; + } + } +} + +template +void expandInputRowsKernelLauncher( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, + int num_local_experts, const int& align_block_size, cudaStream_t stream) { + int64_t const blocks = num_rows * k; + int64_t const threads = 256; + using FuncPtr = decltype(&expandInputRowsKernel); + FuncPtr func_map[2][2] = { + {&expandInputRowsKernel, + &expandInputRowsKernel}, + {&expandInputRowsKernel, + &expandInputRowsKernel}, + }; + bool is_check_skip = num_valid_tokens_ptr != nullptr; + bool is_align_block_size = align_block_size != -1; + auto func = func_map[is_check_skip][is_align_block_size]; + + int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); + + func<<>>( + unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, + expanded_dest_row_to_expanded_source_row, + expanded_source_row_to_expanded_dest_row, expert_first_token_offset, + num_rows, num_valid_tokens_ptr, cols, k, num_local_experts, + align_block_size); +} + +template +__host__ __device__ constexpr static U arrayConvert(T const& input) { + using Type = typename U::Element; + static_assert(T::kElements == U::kElements); + U u; +#pragma unroll + for (int i = 0; i < U::kElements; i++) { + u[i] = static_cast(input[i]); + } + return u; +} + +template +__global__ void finalizeMoeRoutingKernel( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, + int64_t const* num_valid_ptr) { + assert(orig_cols % 4 == 0); + int64_t const original_row = blockIdx.x; + int64_t const num_rows = gridDim.x; + auto 
const offset = original_row * orig_cols; + OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; + int64_t const num_valid = *num_valid_ptr; + + // Load 128-bits per thread, according to the smallest data type we read/write + constexpr int64_t FINALIZE_ELEM_PER_THREAD = + 128 / std::min(cutlass::sizeof_bits::value, + cutlass::sizeof_bits::value); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = orig_cols / FINALIZE_ELEM_PER_THREAD; + + using InputElem = cutlass::Array; + using OutputElem = cutlass::Array; + using ComputeElem = cutlass::Array; + auto const* expanded_permuted_rows_v = + reinterpret_cast(expanded_permuted_rows); + auto* reduced_row_ptr_v = reinterpret_cast(reduced_row_ptr); + +#pragma unroll + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + ComputeElem thread_output; + thread_output.fill(0); + for (int k_idx = 0; k_idx < k; ++k_idx) { + int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_permuted_row = + expanded_source_row_to_expanded_dest_row[expanded_original_row]; + + int64_t const k_offset = original_row * k + k_idx; + float const row_scale = scales[k_offset]; + + // Check after row_rescale has accumulated + if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { + continue; + } + + auto const* expanded_permuted_rows_row_ptr = + expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col; + + ComputeElem expert_result = arrayConvert( + expanded_permuted_rows_row_ptr[elem_index]); + thread_output = thread_output + row_scale * (expert_result); + } + + OutputElem output_elem = + arrayConvert(thread_output); + reduced_row_ptr_v[elem_index] = output_elem; + } +} + +template +void finalizeMoeRoutingKernelLauncher( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, + cudaStream_t stream) { + int64_t const blocks = num_rows; + int64_t const threads = 256; + bool const check_finished = num_valid_ptr != nullptr; + using FuncPtr = decltype(&finalizeMoeRoutingKernel); + FuncPtr func_map[2] = {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}; + auto* const kernel = func_map[check_finished]; + kernel<<>>( + expanded_permuted_rows, reduced_unpermuted_output, scales, + expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, + num_valid_ptr); +} diff --git a/vllm_v0.10.0/csrc/moe/topk_softmax_kernels.cu b/vllm_v0.10.0/csrc/moe/topk_softmax_kernels.cu new file mode 100644 index 0000000..064b76c --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/topk_softmax_kernels.cu @@ -0,0 +1,545 @@ +/* + * Adapted from https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu + * Copyright (c) 2024, The vLLM team. + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include "../cuda_compat.h" + +#ifndef USE_ROCM + #include + #include +#else + #include + #include +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +namespace vllm { +namespace moe { + +/// Aligned array type +template < + typename T, + /// Number of elements in the array + int N, + /// Alignment requirement in bytes + int Alignment = sizeof(T) * N +> +class alignas(Alignment) AlignedArray { + float data[N]; +}; + +// ====================== Softmax things =============================== +// We have our own implementation of softmax here so we can support transposing the output +// in the softmax kernel when we extend this module to support expert-choice routing. +template +__launch_bounds__(TPB) __global__ + void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmpStorage; + + __shared__ float normalizing_factor; + __shared__ float float_max; + + const int thread_row_offset = blockIdx.x * num_cols; + + cub::Sum sum; + float threadData(-FLT_MAX); + + // Don't touch finished rows. + if ((finished != nullptr) && finished[blockIdx.x]) + { + return; + } + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + threadData = max(static_cast(input[idx]), threadData); + } + + const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max()); + if (threadIdx.x == 0) + { + float_max = maxElem; + } + __syncthreads(); + + threadData = 0; + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + threadData += exp((static_cast(input[idx]) - float_max)); + } + + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); + + if (threadIdx.x == 0) + { + normalizing_factor = 1.f / Z; + } + __syncthreads(); + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + const float val = exp((static_cast(input[idx]) - float_max)) * normalizing_factor; + output[idx] = val; + } +} + +template +__launch_bounds__(TPB) __global__ void moeTopK( + const float* inputs_after_softmax, + const bool* finished, + float* output, + IndType* indices, + int* source_rows, + const int num_experts, + const int k, + const int start_expert, + const int end_expert) +{ + + using cub_kvp = cub::KeyValuePair; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmpStorage; + + cub_kvp thread_kvp; + cub::ArgMax arg_max; + + const int num_rows = gridDim.x; + const int block_row = blockIdx.x; + + const bool row_is_active = finished ? 
!finished[block_row] : true; + const int thread_read_offset = blockIdx.x * num_experts; + for (int k_idx = 0; k_idx < k; ++k_idx) + { + thread_kvp.key = 0; + thread_kvp.value = -1.f; // This is OK because inputs are probabilities + + cub_kvp inp_kvp; + for (int expert = threadIdx.x; expert < num_experts; expert += TPB) + { + const int idx = thread_read_offset + expert; + inp_kvp.key = expert; + inp_kvp.value = inputs_after_softmax[idx]; + + for (int prior_k = 0; prior_k < k_idx; ++prior_k) + { + const int prior_winning_expert = indices[k * block_row + prior_k]; + + if (prior_winning_expert == expert) + { + inp_kvp = thread_kvp; + } + } + + thread_kvp = arg_max(inp_kvp, thread_kvp); + } + + const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max); + if (threadIdx.x == 0) + { + // Ignore experts the node isn't responsible for with expert parallelism + const int expert = result_kvp.key; + const bool node_uses_expert = expert >= start_expert && expert < end_expert; + const bool should_process_row = row_is_active && node_uses_expert; + + const int idx = k * block_row + k_idx; + output[idx] = result_kvp.value; + indices[idx] = should_process_row ? (expert - start_expert) : num_experts; + assert(indices[idx] >= 0); + source_rows[idx] = k_idx * num_rows + block_row; + } + __syncthreads(); + } +} + +// ====================== TopK softmax things =============================== + +/* + A Top-K gating softmax written to exploit when the number of experts in the MoE layers + are a small power of 2. This allows us to cleanly share the rows among the threads in + a single warp and eliminate communication between warps (so no need to use shared mem). + + It fuses the softmax, max and argmax into a single kernel. + + Limitations: + 1) This implementation is intended for when the number of experts is a small power of 2. + 2) This implementation assumes k is small, but will work for any k. +*/ + +template +__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ + void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices, + int* source_rows, const int k, const int start_expert, const int end_expert) +{ + // We begin by enforcing compile time assertions and setting up compile time constants. + static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); + static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); + static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); + static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); + + // Number of bytes each thread pulls in per load + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); + static constexpr int ELTS_PER_ROW = NUM_EXPERTS; + static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT; + static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG; + + // Restrictions based on previous section. + static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg"); + static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp"); + static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2"); + static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size"); + + // We have NUM_EXPERTS elements per row. 
We specialize for small #experts + static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT; + static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW; + static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP; + + // Restrictions for previous section. + static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp"); + + // ===================== From this point, we finally start computing run-time variables. ======================== + + // Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps. + // This, each block processes a chunk of rows. We start by computing the start row for each block. + const int cta_base_row = blockIdx.x * ROWS_PER_CTA; + + // Now, using the base row per thread block, we compute the base row per warp. + const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP; + + // The threads in a warp are split into sub-groups that will work on a row. + // We compute row offset for each thread sub-group + const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW; + const int thread_row = warp_base_row + thread_row_in_warp; + + // Threads with indices out of bounds should early exit here. + if (thread_row >= num_rows) + { + return; + } + const bool row_is_active = finished ? !finished[thread_row] : true; + + // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the + // row it will read. + const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW; + + // Now, we compute the group each thread belong to in order to determine the first column to start loads. + const int thread_group_idx = threadIdx.x % THREADS_PER_ROW; + const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG; + const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread; + + // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory, + // this can support all powers of 2 up to 16. + // NOTE(woosuk): The original implementation uses CUTLASS aligned array here. + // We defined our own aligned array and use it here to avoid the dependency on CUTLASS. + using AccessType = AlignedArray; + + // Finally, we pull in the data from global mem + float row_chunk[VPT]; + AccessType* row_chunk_vec_ptr = reinterpret_cast(&row_chunk); + const AccessType* vec_thread_read_ptr = reinterpret_cast(thread_read_ptr); +#pragma unroll + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) + { + row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW]; + } + + // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just + // convert to float afterwards for the exp + sum reduction. + float thread_max = row_chunk[0]; +#pragma unroll + for (int ii = 1; ii < VPT; ++ii) + { + thread_max = max(thread_max, row_chunk[ii]); + } + +// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce. +#pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { + thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW)); + } + + // From this point, thread max in all the threads have the max within the row. + // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum. 
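+    // (Sketch of the math only: softmax_i = exp(x_i - max_j x_j) / sum_j exp(x_j - max_j x_j);
+    // subtracting the row max first keeps expf() from overflowing on large logits.)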
+ float row_sum = 0; +#pragma unroll + for (int ii = 0; ii < VPT; ++ii) + { + row_chunk[ii] = expf(row_chunk[ii] - thread_max); + row_sum += row_chunk[ii]; + } + +// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern. +#pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { + row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW); + } + + // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables + // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to + // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row. + // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the + // argmax after computing the softmax. + const float reciprocal_row_sum = 1.f / row_sum; + +#pragma unroll + for (int ii = 0; ii < VPT; ++ii) + { + row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum; + } + + // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along + // with the max index. + int start_col = first_elt_read_by_thread; + static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; + + for (int k_idx = 0; k_idx < k; ++k_idx) + { + // First, each thread does the local argmax + float max_val = row_chunk[0]; + int expert = start_col; +#pragma unroll + for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG) + { +#pragma unroll + for (int ii = 0; ii < ELTS_PER_LDG; ++ii) + { + float val = row_chunk[ldg * ELTS_PER_LDG + ii]; + + // No check on the experts here since columns with the smallest index are processed first and only + // updated if > (not >=) + if (val > max_val) + { + max_val = val; + expert = col + ii; + } + } + } + +// Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max. +// This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can +// then blank out their max with -inf and the warp can run more iterations... +#pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { + float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW); + int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW); + + // We want lower indices to "win" in every thread so we break ties this way + if (other_max > max_val || (other_max == max_val && other_expert < expert)) + { + max_val = other_max; + expert = other_expert; + } + } + + // Write the max for this k iteration to global memory. + if (thread_group_idx == 0) + { + // Add a guard to ignore experts not included by this node + const bool node_uses_expert = expert >= start_expert && expert < end_expert; + const bool should_process_row = row_is_active && node_uses_expert; + + // The lead thread from each sub-group will write out the final results to global memory. (This will be a + // single) thread per row of the input/output matrices. + const int idx = k * thread_row + k_idx; + output[idx] = max_val; + indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS; + source_rows[idx] = k_idx * num_rows + thread_row; + } + + // Finally, we clear the value in the thread with the current max if there is another iteration to run. 
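+        // For example (numbers assumed, not fixed by the template): with
+        // ELTS_PER_LDG = 4 and THREADS_PER_ROW = 8, COLS_PER_GROUP_LDG = 32,
+        // so expert 13 lives in thread (13 / 4) % 8 = 3 of the row group, in
+        // ldg group 13 / 32 = 0 at offset 13 % 4 = 1; row_chunk[1] of that
+        // thread is the value that gets reset below.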
+ if (k_idx + 1 < k) + { + const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG; + const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW; + + // Only the thread in the group which produced the max will reset the "winning" value to -inf. + if (thread_group_idx == thread_to_clear_in_group) + { + const int offset_for_expert = expert % ELTS_PER_LDG; + // Safe to set to any negative value since row_chunk values must be between 0 and 1. + row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f; + } + } + } +} + +namespace detail +{ +// Constructs some constants needed to partition the work across threads at compile time. +template +struct TopkConstants +{ + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); + static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, ""); + static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE)); + static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG; + static constexpr int THREADS_PER_ROW = EXPERTS / VPT; + static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW; +}; +} // namespace detail + +template +void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, + int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) +{ + static constexpr std::size_t MAX_BYTES_PER_LDG = 16; + + static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); + using Constants = detail::TopkConstants; + static constexpr int VPT = Constants::VPT; + static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; + const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; + const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; + + dim3 block_dim(WARP_SIZE, WARPS_PER_TB); + topkGatingSoftmax<<>>( + input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); +} + +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, \ + token_expert_indices, num_tokens, topk, 0, num_experts, \ + stream); + +template +void topkGatingSoftmaxKernelLauncher( + const float* gating_output, + float* topk_weights, + IndType* topk_indices, + int* token_expert_indices, + float* softmax_workspace, + const int num_tokens, + const int num_experts, + const int topk, + cudaStream_t stream) { + static constexpr int WARPS_PER_TB = 4; + switch (num_experts) { + case 1: + LAUNCH_SOFTMAX(1, WARPS_PER_TB); + break; + case 2: + LAUNCH_SOFTMAX(2, WARPS_PER_TB); + break; + case 4: + LAUNCH_SOFTMAX(4, WARPS_PER_TB); + break; + case 8: + LAUNCH_SOFTMAX(8, WARPS_PER_TB); + break; + case 16: + LAUNCH_SOFTMAX(16, WARPS_PER_TB); + break; + case 32: + LAUNCH_SOFTMAX(32, WARPS_PER_TB); + break; + case 64: + LAUNCH_SOFTMAX(64, WARPS_PER_TB); + break; + case 128: + LAUNCH_SOFTMAX(128, WARPS_PER_TB); + break; + case 256: + LAUNCH_SOFTMAX(256, WARPS_PER_TB); + break; + default: { + TORCH_CHECK(softmax_workspace != nullptr, + "softmax_workspace must be provided for num_experts that are not a power of 2."); + static constexpr int TPB = 256; + moeSoftmax<<>>( + gating_output, nullptr, softmax_workspace, num_experts); + moeTopK<<>>( + softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, + num_experts, topk, 0, num_experts); + } + } +} + +} // namespace moe +} // namespace vllm + +void 
topk_softmax( + torch::Tensor& topk_weights, // [num_tokens, topk] + torch::Tensor& topk_indices, // [num_tokens, topk] + torch::Tensor& token_expert_indices, // [num_tokens, topk] + torch::Tensor& gating_output) // [num_tokens, num_experts] +{ + const int num_experts = gating_output.size(-1); + const auto num_tokens = gating_output.numel() / num_experts; + const int topk = topk_weights.size(-1); + + const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); + const bool needs_workspace = !is_pow_2 || num_experts > 256; + const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options()); + + if(topk_indices.scalar_type() == at::ScalarType::Int) + { + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } + else if (topk_indices.scalar_type() == at::ScalarType::UInt32) + { + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } + else { + assert(topk_indices.scalar_type() == at::ScalarType::Int64); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } +} diff --git a/vllm_v0.10.0/csrc/moe/torch_bindings.cpp b/vllm_v0.10.0/csrc/moe/torch_bindings.cpp new file mode 100644 index 0000000..97df311 --- /dev/null +++ b/vllm_v0.10.0/csrc/moe/torch_bindings.cpp @@ -0,0 +1,84 @@ +#include "core/registration.h" +#include "moe_ops.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { + // Apply topk softmax to the gating outputs. + m.def( + "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " + "token_expert_indices, Tensor gating_output) -> ()"); + m.impl("topk_softmax", torch::kCUDA, &topk_softmax); + + // Calculate the result of moe by summing up the partial results + // from all selected experts. + m.def("moe_sum(Tensor input, Tensor! output) -> ()"); + m.impl("moe_sum", torch::kCUDA, &moe_sum); + + // Aligning the number of tokens to be processed by each expert such + // that it is divisible by the block size. + m.def( + "moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + +#ifndef USE_ROCM + m.def( + "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " + "Tensor b_scales, Tensor? b_qzeros, " + "Tensor? topk_weights, Tensor sorted_token_ids, " + "Tensor expert_ids, Tensor num_tokens_post_pad, " + "int top_k, int BLOCK_SIZE_M, int BLOCK_SIZE_N, int BLOCK_SIZE_K, " + "int bit) -> Tensor"); + + m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm); + + m.def( + "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none," + "Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale, Tensor? " + "b_zeros_or_none," + "Tensor? g_idx_or_none, Tensor? 
perm_or_none, Tensor! workspace," + "Tensor sorted_token_ids," + "Tensor! expert_ids, Tensor! num_tokens_past_padded," + "Tensor! topk_weights, int moe_block_size, int top_k, " + "bool mul_topk_weights, bool is_ep, int b_q_type_id," + "int size_m, int size_n, int size_k," + "bool is_full_k, bool use_atomic_add," + "bool use_fp32_reduce, bool is_zp_float) -> Tensor"); + m.def( + "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " + "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " + "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " + "int b_q_type, SymInt size_m, " + "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " + "topk, " + "int moe_block_size, bool replicate_input, bool apply_weights)" + " -> Tensor"); + + m.def( + "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," + "Tensor token_expert_indices, Tensor? expert_map, int n_expert," + "int n_local_expert," + "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " + "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " + "m_indices)->()"); + + m.def( + "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights," + "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " + "expert_first_token_offset, int n_expert, int n_local_expert,int " + "topk, Tensor! hidden_states)->()"); + + m.def("moe_permute_unpermute_supported() -> bool"); + m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); + + // Row shuffle for MoE + m.def( + "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! " + "output_tensor) -> ()"); + m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows); + +#endif +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm_v0.10.0/csrc/ops.h b/vllm_v0.10.0/csrc/ops.h new file mode 100644 index 0000000..97a247d --- /dev/null +++ b/vllm_v0.10.0/csrc/ops.h @@ -0,0 +1,362 @@ +#pragma once + +#include +#include + +#include "core/scalar_type.hpp" + +#include + +torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { + // Ensure tensor is on CUDA + if (!tensor.is_cuda()) { + throw std::runtime_error("Tensor must be on CUDA device"); + } + + // Get the raw data pointer + void* data_ptr = tensor.data_ptr(); + + // Get tensor sizes and strides + std::vector sizes = tensor.sizes().vec(); + std::vector strides = tensor.strides().vec(); + + // Get tensor options (dtype, device) + auto options = tensor.options(); + + // Create a new tensor from the raw data pointer + auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options); + + return new_tensor; +} + +void paged_attention_v1( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step); + +void paged_attention_v2( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const std::optional& 
alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step); + +#ifndef USE_ROCM +void merge_attn_states(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse); + +void convert_vertical_slash_indexes( + torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] + torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V] + torch::Tensor q_seqlens, // [BATCH, ] + torch::Tensor kv_seqlens, // [BATCH, ] + torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S] + int64_t context_size, int64_t block_size_M, int64_t block_size_N, + bool causal); + +void convert_vertical_slash_indexes_mergehead( + torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] + torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS] + torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V] + torch::Tensor q_seqlens, // [BATCH, ] + torch::Tensor kv_seqlens, // [BATCH, ] + torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V] + torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S] + torch::Tensor vertical_indices_count, // [N_HEADS, ] + torch::Tensor slash_indices_count, int64_t context_size, + int64_t block_size_M, int64_t block_size_N, bool causal); +#endif + +void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, + double epsilon); + +void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, + torch::Tensor& weight, double epsilon); + +void apply_repetition_penalties_(torch::Tensor& logits, + const torch::Tensor& prompt_mask, + const torch::Tensor& output_mask, + const torch::Tensor& repetition_penalties); + +void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& weight, torch::Tensor& scale, + double epsilon); + +void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& residual, + torch::Tensor& weight, + torch::Tensor& scale, double epsilon); + +void rms_norm_dynamic_per_token_quant(torch::Tensor& out, + torch::Tensor const& input, + torch::Tensor const& weight, + torch::Tensor& scales, + double const epsilon, + std::optional scale_ub, + std::optional residual); + +void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + std::optional key, int64_t head_size, + torch::Tensor& cos_sin_cache, bool is_neox); + +void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + std::optional key, + int64_t head_size, torch::Tensor& cos_sin_cache, + bool is_neox, int64_t rot_dim, + torch::Tensor& cos_sin_cache_offsets); + +void silu_and_mul(torch::Tensor& out, torch::Tensor& input); + +void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& scale); + +void mul_and_silu(torch::Tensor& out, torch::Tensor& input); + +void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); + +void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); + +void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, 
+ double threshold); + +void gelu_new(torch::Tensor& out, torch::Tensor& input); + +void gelu_fast(torch::Tensor& out, torch::Tensor& input); + +void gelu_quick(torch::Tensor& out, torch::Tensor& input); + +void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, + int64_t block_size, torch::Tensor& input_tokens, + torch::Tensor& sampled_token_ids, + torch::Tensor& input_positions, + torch::Tensor& seq_lens, + torch::Tensor& slot_mapping, + torch::Tensor& block_tables); + +void advance_step_flashinfer( + int64_t num_seqs, int64_t num_queries, int64_t block_size, + torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, + torch::Tensor& input_positions, torch::Tensor& seq_lens, + torch::Tensor& slot_mapping, torch::Tensor& block_tables, + torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, + torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); + +void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale); + +torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); + +#ifndef USE_ROCM +torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::vector& codebook_partition_sizes, + const std::optional& bias); + +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, const torch::Tensor& codebooks, + const std::vector& codebook_partition_sizes); + +torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, + torch::Tensor _scaling_factors, torch::Tensor _zeros, + int64_t split_k_iters); + +torch::Tensor awq_dequantize(torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, int64_t split_k_iters, + int64_t thx, int64_t thy); + +torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm); +#endif + +torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, + int64_t n, + std::optional const& dtype); + +torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, + int64_t type, int64_t row); + +torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, + int64_t row); + +torch::Tensor ggml_moe_a8(torch::Tensor X, torch::Tensor W, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_padded, int64_t type, + int64_t row, int64_t top_k, int64_t tokens); + +torch::Tensor ggml_moe_a8_vec(torch::Tensor X, torch::Tensor W, + torch::Tensor topk_ids, int64_t top_k, + int64_t type, int64_t row, int64_t tokens); + +int64_t ggml_moe_get_block_size(int64_t type); + +#ifndef USE_ROCM + +bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability); +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); +bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability); +bool cutlass_group_gemm_supported(int64_t cuda_device_capability); + +void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha); + +void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& 
b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch); + +void cutlass_fp4_group_mm( + torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, + const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, + const torch::Tensor& alphas, const torch::Tensor& problem_sizes, + const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets); + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k, + const std::optional& blockscale_offsets); + +void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + const torch::Tensor& expert_num_tokens, + const int64_t num_local_experts, + const int64_t padded_m, const int64_t n, + const int64_t k); + +void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias); + +bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); + +void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& e, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +std::vector cutlass_sparse_compress(torch::Tensor const& a); + +void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, + torch::Tensor& output_scale, + torch::Tensor const& input_scale); + +void scaled_fp4_experts_quant( + torch::Tensor& output, torch::Tensor& output_scale, + torch::Tensor const& input, torch::Tensor const& input_global_scale, + torch::Tensor const& input_offset_by_experts, + torch::Tensor const& output_scale_offset_by_experts); + +void per_token_group_quant_fp8(const torch::Tensor& input, + torch::Tensor& output_q, torch::Tensor& output_s, + int64_t group_size, double eps, double fp8_min, + double fp8_max, bool scale_ue8m0); +#endif + +void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor const& scale, + std::optional const& azp); + +void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor& scales, + std::optional const& azp); + +torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, + bool use_exllama, int64_t bit); + +void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit); + +void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor const& scale); + +void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor& scale); + +void dynamic_per_token_scaled_fp8_quant( + torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, + std::optional const& scale_ub); + +void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, + const torch::Tensor& A, const torch::Tensor& B, + const torch::Tensor& C, + const std::optional& D_, + const 
std::optional& z_, + const std::optional& delta_bias_, + bool delta_softplus, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, + const torch::Tensor& ssm_states, int64_t pad_slot_id); + +using fptr_t = int64_t; +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, + bool fully_connected); +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); +void dispose(fptr_t _fa); +int64_t meta_size(); +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs); +std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa); +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, + const std::vector>& offsets); +std::tuple allocate_shared_buffer_and_handle( + int64_t size); +int64_t open_mem_handle(torch::Tensor& mem_handle); +void free_shared_buffer(int64_t buffer); + +#ifdef USE_ROCM +fptr_t init_custom_qr(int64_t rank, int64_t world_size, + std::optional qr_max_size = std::nullopt); +void qr_destroy(fptr_t _fa); +torch::Tensor qr_get_handle(fptr_t _fa); +void qr_open_handles(fptr_t _fa, const std::vector& handles); +void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + int64_t quant_level, bool cast_bf2half = false); +int64_t qr_max_size(); +#endif \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/permute_cols.cu b/vllm_v0.10.0/csrc/permute_cols.cu new file mode 100644 index 0000000..f51fa73 --- /dev/null +++ b/vllm_v0.10.0/csrc/permute_cols.cu @@ -0,0 +1,88 @@ +#include + +#include +#include + +#include + +static constexpr int default_threads = 256; +static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. 
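+// Equivalently (a restatement for clarity, not extra behavior):
+//   out[m][k] = a[m][perm[k]]  for every row m and column k,
+// i.e. the fused-kernel form of A[..., perm] / A.index_select(-1, perm).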
+// Currently only supports 16bit types (since we permute half types) +__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, + int size_k, int block_rows) { + int start_row = block_rows * blockIdx.x; + int finish_row = start_row + block_rows; + if (finish_row > size_m) { + finish_row = size_m; + } + int cur_block_rows = std::max(finish_row - start_row, 0); + + int row_stride = size_k * sizeof(half) / 16; + + auto permute_row = [&](int row) { + int iters = size_k / default_threads; + int rest = size_k % default_threads; + + int offset = row * row_stride; + + half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); + half* out_half = reinterpret_cast(out_int4_ptr + offset); + + int base_k = 0; + + for (int i = 0; i < iters; i++) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += default_threads; + } + + if (rest) { + if (threadIdx.x < rest) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int i = 0; i < cur_block_rows; i++) { + int cur_row = start_row + i; + if (cur_row < size_m) { + permute_row(cur_row); + } + } +} + +// More efficient version of A[..., perm] +// taken from gptq_marlin.cu +torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + auto dev = A.get_device(); + auto stream = at::cuda::getCurrentCUDAStream(dev); + + TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16, + "Currently only 16bit types are supported"); + TORCH_CHECK(A.is_contiguous(), "A must be contiguous"); + TORCH_CHECK(A.size(-1) % 8 == 0, + "A columns must be a multiple of 8 (128bits)"); + auto A_2d = A.view({-1, A.size(-1)}); + + torch::Tensor D = torch::empty_like(A); + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + int block_rows = div_ceil(A_2d.size(0), sms); + permute_cols_kernel<<>>( + reinterpret_cast(A_2d.const_data_ptr()), + perm.const_data_ptr(), reinterpret_cast(D.mutable_data_ptr()), + A_2d.size(0), A_2d.size(1), block_rows); + return D; +} \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/pos_encoding_kernels.cu b/vllm_v0.10.0/csrc/pos_encoding_kernels.cu new file mode 100644 index 0000000..266f2a0 --- /dev/null +++ b/vllm_v0.10.0/csrc/pos_encoding_kernels.cu @@ -0,0 +1,306 @@ +#include +#include +#include + +#include "cuda_compat.h" +#include "dispatch_utils.h" + +namespace vllm { + +template +inline __device__ void apply_token_rotary_embedding( + scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, + const scalar_t* __restrict__ sin_ptr, int rot_offset, int embed_dim) { + int x_index, y_index; + scalar_t cos, sin; + if (IS_NEOX) { + // GPT-NeoX style rotary embedding. + x_index = rot_offset; + y_index = embed_dim + rot_offset; + cos = VLLM_LDG(cos_ptr + x_index); + sin = VLLM_LDG(sin_ptr + x_index); + } else { + // GPT-J style rotary embedding. 
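+    // Indexing note (derived from the two branches here, not new behavior):
+    // for a given rot_offset i, NeoX rotates the split-half pair
+    // (arr[i], arr[embed_dim + i]) while GPT-J rotates the interleaved pair
+    // (arr[2*i], arr[2*i + 1]); both read cos/sin entry i of the cache.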
+ x_index = 2 * rot_offset; + y_index = 2 * rot_offset + 1; + cos = VLLM_LDG(cos_ptr + x_index / 2); + sin = VLLM_LDG(sin_ptr + x_index / 2); + } + + const scalar_t x = arr[x_index]; + const scalar_t y = arr[y_index]; + arr[x_index] = x * cos - y * sin; + arr[y_index] = y * cos + x * sin; +} + +template +inline __device__ void apply_rotary_embedding( + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, + // head_size] or [num_tokens, num_heads, + // head_size] + scalar_t* __restrict__ key, // nullptr or + // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* cache_ptr, const int head_size, const int num_heads, + const int num_kv_heads, const int rot_dim, const int token_idx, + const int64_t query_stride, const int64_t key_stride, + const int64_t head_stride) { + const int embed_dim = rot_dim / 2; + const scalar_t* cos_ptr = cache_ptr; + const scalar_t* sin_ptr = cache_ptr + embed_dim; + + const int nq = num_heads * embed_dim; + for (int i = threadIdx.x; i < nq; i += blockDim.x) { + const int head_idx = i / embed_dim; + const int64_t token_head = + token_idx * query_stride + head_idx * head_stride; + const int rot_offset = i % embed_dim; + apply_token_rotary_embedding( + query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); + } + + if (key != nullptr) { + const int nk = num_kv_heads * embed_dim; + for (int i = threadIdx.x; i < nk; i += blockDim.x) { + const int head_idx = i / embed_dim; + const int64_t token_head = + token_idx * key_stride + head_idx * head_stride; + const int rot_offset = i % embed_dim; + apply_token_rotary_embedding( + key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); + } + } +} + +template +__global__ void rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, + // head_size] or [num_tokens, num_heads, + // head_size] + scalar_t* __restrict__ key, // nullptr or + // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int64_t head_stride, const int num_heads, const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + + apply_rotary_embedding( + query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, + token_idx, query_stride, key_stride, head_stride); +} + +template +__global__ void batched_rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, + // head_size] or [num_tokens, num_heads, + // head_size] + scalar_t* __restrict__ key, // nullptr or + // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] + const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int64_t head_stride, const int num_heads, const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. 
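+  // Identical to rotary_embedding_kernel above except that each token first
+  // adds its entry from cos_sin_cache_offsets (e.g. one cache region per
+  // packed LoRA, per the launcher comment further down) before indexing by
+  // position.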
+ const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; + const scalar_t* cache_ptr = + cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + + apply_rotary_embedding( + query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, + token_idx, query_stride, key_stride, head_stride); +} + +} // namespace vllm + +void rotary_embedding( + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or + // [num_tokens, num_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] + std::optional key, + // null or + // [batch_size, seq_len, num_kv_heads * head_size] or + // [num_tokens, num_kv_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] + int64_t head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox) { + // num_tokens = batch_size * seq_len + int64_t num_tokens = positions.numel(); + int positions_ndim = positions.dim(); + + // Make sure num_tokens dim is consistent across positions, query, and key + TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK(query.size(0) == positions.size(0) && + (!key.has_value() || key->size(0) == positions.size(0)), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + (!key.has_value() || key->size(0) == positions.size(0)) && + query.size(1) == positions.size(1) && + (!key.has_value() || key->size(1) == positions.size(1)), + "query, key and positions must have the same batch_size and seq_len"); + } + + // Make sure head_size is valid for query and key + // hidden_size = num_heads * head_size + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + + // Make sure query and key have consistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads; + TORCH_CHECK(num_heads % num_kv_heads == 0); + + int rot_dim = cos_sin_cache.size(1); + int seq_dim_idx = positions_ndim - 1; + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; + // Determine head stride: for [*, heads, head_size] use stride of last dim; + // for flat [*, heads*head_size], heads blocks are contiguous of size + // head_size + int query_ndim = query.dim(); + int64_t head_stride = + (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { + if (is_neox) { + vllm::rotary_embedding_kernel<<>>( + positions.data_ptr(), query.data_ptr(), + key.has_value() ? 
key->data_ptr() : nullptr, + cos_sin_cache.data_ptr(), rot_dim, query_stride, key_stride, + head_stride, num_heads, num_kv_heads, head_size); + } else { + vllm::rotary_embedding_kernel + <<>>( + positions.data_ptr(), query.data_ptr(), + key.has_value() ? key->data_ptr() : nullptr, + cos_sin_cache.data_ptr(), rot_dim, query_stride, + key_stride, head_stride, num_heads, num_kv_heads, head_size); + } + }); +} + +/* +Batched version of rotary embedding, pack multiple LoRAs together +and process in batched manner. +*/ +void batched_rotary_embedding( + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or + // [num_tokens, num_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] + std::optional + key, // null or + // [batch_size, seq_len, num_kv_heads * head_size] or + // [num_tokens, num_kv_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] + int64_t head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox, int64_t rot_dim, + torch::Tensor& cos_sin_cache_offsets // [num_tokens] or [batch_size] +) { + // num_tokens = batch_size * seq_len + int64_t num_tokens = cos_sin_cache_offsets.size(0); + TORCH_CHECK( + positions.size(0) == num_tokens || positions.numel() == num_tokens, + "positions must have the same num_tokens or batch_size as " + "cos_sin_cache_offsets"); + + int positions_ndim = positions.dim(); + // Make sure num_tokens dim is consistent across positions, query, and key + TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK(query.size(0) == positions.size(0) && + (!key.has_value() || key->size(0) == positions.size(0)), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + (!key.has_value() || key->size(0) == positions.size(0)) && + query.size(1) == positions.size(1) && + (!key.has_value() || key->size(1) == positions.size(1)), + "query, key and positions must have the same batch_size and seq_len"); + } + + // Make sure head_size is valid for query and key + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + + // Make sure query and key have concistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads; + TORCH_CHECK(num_heads % num_kv_heads == 0); + + int seq_dim_idx = positions_ndim - 1; + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; + // Determine head stride: for [*, heads, head_size] use stride of last dim; + // for flat [*, heads*head_size], heads blocks are contiguous of size + // head_size + int query_ndim = query.dim(); + int64_t head_stride = + (query_ndim == positions_ndim + 2) ? 
query.stride(-2) : head_size; + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { + if (is_neox) { + vllm::batched_rotary_embedding_kernel + <<>>( + positions.data_ptr(), query.data_ptr(), + key.has_value() ? key->data_ptr() : nullptr, + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, + key_stride, head_stride, num_heads, num_kv_heads, head_size); + } else { + vllm::batched_rotary_embedding_kernel + <<>>( + positions.data_ptr(), query.data_ptr(), + key.has_value() ? key->data_ptr() : nullptr, + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, + key_stride, head_stride, num_heads, num_kv_heads, head_size); + } + }); +} diff --git a/vllm_v0.10.0/csrc/prepare_inputs/advance_step.cu b/vllm_v0.10.0/csrc/prepare_inputs/advance_step.cu new file mode 100644 index 0000000..3d5077d --- /dev/null +++ b/vllm_v0.10.0/csrc/prepare_inputs/advance_step.cu @@ -0,0 +1,336 @@ +/* + * The goal of this GPU kernel is to advance input tensors on the GPU directly + * PR: https://github.com/vllm-project/vllm/pull/6338 + * Current restrictions: + * 1. Specialized for DraftModelRunner + * 2. Supports flash_attn only + */ + +#include "advance_step.cuh" + +namespace prepare_inputs { + +// +template +__global__ void advance_step_flashattn_kernel( + int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, + long const* sampled_token_ids_ptr, long* input_positions_ptr, + int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr, + int64_t const block_tables_stride) { + int const n_pad = num_seqs - num_queries; + if (n_pad && blockIdx.x == 0) { + // Handle cuda graph padding + int const offset = num_queries; + for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { + input_tokens_ptr[offset + i] = 0; + input_positions_ptr[offset + i] = 0; + slot_mapping_ptr[offset + i] = -1; + } + } + + int num_query_blocks = div_ceil(num_queries, num_threads); + + if (blockIdx.x >= num_query_blocks) { + return; + } + + int cur_query_id = blockIdx.x * num_threads + threadIdx.x; + + if (cur_query_id >= num_queries) { + return; + } + + // Update input_tokens + input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; + + int seq_len = seq_lens_ptr[cur_query_id]; + int next_seq_len = seq_len + 1; + int next_input_pos = next_seq_len - 1; + + // Update seq_lens + seq_lens_ptr[cur_query_id] = next_seq_len; + // Update input_positions + input_positions_ptr[cur_query_id] = next_input_pos; + + int const* seq_block_tables_ptr = + block_tables_ptr + block_tables_stride * cur_query_id; + + int block_index = next_input_pos / block_size; + int block_offset = next_input_pos % block_size; + + int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset; + // Update slot_mapping + slot_mapping_ptr[cur_query_id] = slot_num; +} + +inline void verify_tensor(std::string const& name, torch::Tensor const& t, + int64_t const size_0, int64_t const size_1, + c10::ScalarType const type) { + bool size_0_cond = true; + if (size_0 != -1) { + size_0_cond = t.size(0) == size_0; + } + + bool size_1_cond = true; + if (size_1 != -1) { + size_1_cond = t.size(1) == size_1; + } + + bool is_contiguous = t.is_contiguous(); + bool same_type = t.dtype() == type; + + bool pass = size_0_cond && size_1_cond && 
is_contiguous && same_type; + if (!pass) { + TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(), + " is_cont = ", t.is_contiguous(), ", type = ", t.dtype(), + " is not as expected: shape = [", size_0, ", ", size_1, + "], type = ", type); + } +} + +/// each thread processes a block per query +__global__ void advance_step_flashinfer_kernel( + int num_threads, int num_seqs, int num_queries, int block_size, + long* input_tokens_ptr, long const* sampled_token_ids_ptr, + long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr, + int const* block_tables_ptr, int64_t const block_tables_stride, + int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) { + int const n_pad = num_seqs - num_queries; + if (n_pad && blockIdx.x == 0) { + // Handle cuda graph padding + int const offset = num_queries; + for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { + input_tokens_ptr[offset + i] = 0; + input_positions_ptr[offset + i] = 0; + slot_mapping_ptr[offset + i] = -1; + } + } + int num_query_blocks = div_ceil(num_queries, num_threads); + + if (blockIdx.x < num_query_blocks) { + int cur_query_id = blockIdx.x * num_threads + threadIdx.x; + + if (cur_query_id < num_queries) { + // Update input_tokens + input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; + + int seq_len = seq_lens_ptr[cur_query_id]; + int next_seq_len = seq_len + 1; + int next_input_pos = next_seq_len - 1; + + // Update seq_lens + seq_lens_ptr[cur_query_id] = next_seq_len; + // Update input_positions + input_positions_ptr[cur_query_id] = next_input_pos; + + int const* seq_block_tables_ptr = + block_tables_ptr + block_tables_stride * cur_query_id; + + int block_index = next_input_pos / block_size; + int block_offset = next_input_pos % block_size; + + // Update paged_kv_last_page_len + paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1; + + int slot_num = + seq_block_tables_ptr[block_index] * block_size + block_offset; + // Update slot_mapping + slot_mapping_ptr[cur_query_id] = slot_num; + block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size); + } + } +} + +__global__ void advance_step_flashinfer_indptr_kernel( + int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, + int* block_table_bound_ptr) { + int idx = blockIdx.x * num_threads + threadIdx.x; + // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } + if (idx < num_queries) { + int sum = 0; + for (int i = 0; i <= idx; ++i) { + sum += block_table_bound_ptr[i]; + } + paged_kv_indptr_ptr[idx + 1] = sum; + } +} + +__global__ void advance_step_flashinfer_indices_kernel( + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, + int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; + } + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. 
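+  // Flattening sketch: for each query q and block column col with
+  // col < block_table_bound[q], the loop below writes
+  //   paged_kv_indices[paged_kv_indptr[q] + col] = block_tables[q][col];
+  // so only the blocks actually in use by a sequence are copied, packed back
+  // to back in paged_kv_indices in query order.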
+ for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } + } +} + +void advance_step_flashattn(int num_seqs, int num_queries, int block_size, + torch::Tensor& input_tokens, // type: long + torch::Tensor& sampled_token_ids, // type: long + torch::Tensor& input_positions, // type: long + torch::Tensor& seq_lens, // type: int + torch::Tensor& slot_mapping, // type: long + torch::Tensor& block_tables) { // type: int + + if (logging) { + printf("advance_step_flashattn:\n"); + printf(" num_seqs = %d\n", num_seqs); + printf(" num_queries = %d\n", num_queries); + printf(" block_size = %d\n", block_size); + } + // Verify all tensors + verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); + verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, + at::kLong); + verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); + verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); + verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); + verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); + + int dev = sampled_token_ids.get_device(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); + + int blocks; + cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); + + advance_step_flashattn_kernel + <<>>( + num_seqs, num_queries, block_size, + reinterpret_cast(input_tokens.data_ptr()), + reinterpret_cast(sampled_token_ids.data_ptr()), + reinterpret_cast(input_positions.data_ptr()), + reinterpret_cast(seq_lens.data_ptr()), + reinterpret_cast(slot_mapping.data_ptr()), + reinterpret_cast(block_tables.data_ptr()), + block_tables.stride(0)); +} + +void advance_step_flashinfer( + int num_seqs, int num_queries, int block_size, + torch::Tensor& input_tokens, // type: long + torch::Tensor& sampled_token_ids, // type: long + torch::Tensor& input_positions, // type: long + torch::Tensor& seq_lens, // type: int + torch::Tensor& slot_mapping, // type: long + torch::Tensor& block_tables, // type: int + torch::Tensor& paged_kv_indices, // type: int + torch::Tensor& paged_kv_indptr, // type: int + torch::Tensor& paged_kv_last_page_len, // type: int + torch::Tensor& block_table_bound) { // type: int + + if (logging) { + printf("advance_step_flashinfer:\n"); + printf(" num_seqs = %d\n", num_seqs); + printf(" num_queries = %d\n", num_queries); + printf(" block_size = %d\n", block_size); + printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0)); + } + // Verify all tensors + verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); + // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, + // at::kLong); + verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); + verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); + verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); + verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); + + verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt); + verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, 
at::kInt); + verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1, + at::kInt); + + verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt); + + int dev = sampled_token_ids.get_device(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); + + int blocks; + int threads; + cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); + cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); + + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); + } + advance_step_flashinfer_kernel<<>>( + threads, num_seqs, num_queries, block_size, + reinterpret_cast(input_tokens.data_ptr()), + reinterpret_cast(sampled_token_ids.data_ptr()), + reinterpret_cast(input_positions.data_ptr()), + reinterpret_cast(seq_lens.data_ptr()), + reinterpret_cast(slot_mapping.data_ptr()), + reinterpret_cast(block_tables.data_ptr()), + block_tables.stride(0), + reinterpret_cast(paged_kv_last_page_len.data_ptr()), + reinterpret_cast(block_table_bound.data_ptr())); + + advance_step_flashinfer_indptr_kernel<<>>( + threads, num_seqs, num_queries, + reinterpret_cast(paged_kv_indptr.data_ptr()), + reinterpret_cast(block_table_bound.data_ptr())); + + advance_step_flashinfer_indices_kernel<<>>( + num_seqs, num_queries, + reinterpret_cast(block_tables.data_ptr()), + block_tables.stride(0), + reinterpret_cast(paged_kv_indices.data_ptr()), + reinterpret_cast(paged_kv_indptr.data_ptr()), + reinterpret_cast(block_table_bound.data_ptr())); +} + +} // namespace prepare_inputs + +void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, + int64_t block_size, torch::Tensor& input_tokens, + torch::Tensor& sampled_token_ids, + torch::Tensor& input_positions, + torch::Tensor& seq_lens, + torch::Tensor& slot_mapping, + torch::Tensor& block_tables) { + prepare_inputs::advance_step_flashattn( + num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, + input_positions, seq_lens, slot_mapping, block_tables); +} + +void advance_step_flashinfer( + int64_t num_seqs, int64_t num_queries, int64_t block_size, + torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, + torch::Tensor& input_positions, torch::Tensor& seq_lens, + torch::Tensor& slot_mapping, torch::Tensor& block_tables, + torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, + torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) { + prepare_inputs::advance_step_flashinfer( + num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, + input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices, + paged_kv_indptr, paged_kv_last_page_len, block_table_bound); +} diff --git a/vllm_v0.10.0/csrc/prepare_inputs/advance_step.cuh b/vllm_v0.10.0/csrc/prepare_inputs/advance_step.cuh new file mode 100644 index 0000000..f215746 --- /dev/null +++ b/vllm_v0.10.0/csrc/prepare_inputs/advance_step.cuh @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace prepare_inputs { + +static constexpr int max_threads = 256; +static constexpr bool logging = false; + +constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } + +} // namespace prepare_inputs diff --git a/vllm_v0.10.0/csrc/quantization/activation_kernels.cu 
b/vllm_v0.10.0/csrc/quantization/activation_kernels.cu new file mode 100644 index 0000000..67e9149 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/activation_kernels.cu @@ -0,0 +1,121 @@ +#include +#include +#include + +#include +#include "core/math.hpp" +#include "cuda_compat.h" +#include "dispatch_utils.h" + +#include "quantization/fp8/common.cuh" + +namespace vllm { + +template +__device__ __forceinline__ T silu_kernel(const T& x) { + // x * sigmoid(x) + return (T)(((float)x) / (1.0f + expf((float)-x))); +} + +// Activation and gating kernel template. +template +__global__ void act_and_mul_quant_kernel( + fp8_type* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const float* scale, const int d) { + const int32_t blocks_per_token = gridDim.y; + + const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t); + + // We don't expect the hidden dimension to exceed 32 bits so int32 should + // be safe here. + const int32_t tgt_elems_per_block = div_ceil(d, blocks_per_token); + const int32_t elems_per_block = + round_to_next_multiple_of(tgt_elems_per_block, elems_per_128bit_load); + const int32_t block_start = blockIdx.y * elems_per_block; + int32_t block_end = block_start + elems_per_block; + block_end = block_end > d ? d : block_end; + + // token_idx is 64 bit to prevent 32 bit overflow when the number of tokens + // is very large + const int64_t token_idx = blockIdx.x; + const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d; + const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d; + fp8_type* __restrict__ out_ptr = out + token_idx * d; + + // 128-bit vectorized code + const int32_t vec_loop_end = + round_to_previous_multiple_of(elems_per_128bit_load, block_end); + const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load; + const int32_t vec_start_idx = block_start / elems_per_128bit_load; + + const int4* __restrict__ x_128bit_ptr = reinterpret_cast(x_ptr); + const int4* __restrict__ y_128bit_ptr = reinterpret_cast(y_ptr); + int2* __restrict__ out_128bit_ptr = reinterpret_cast(out_ptr); + + float inverted_scale = 1 / *scale; +#pragma unroll + for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx; + vec_idx += blockDim.x) { + const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]); + const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]); + using scalar_128bit_vec_t = std::array; + using scalar_64bit_vec_t = std::array; + + scalar_64bit_vec_t out_vec; + const auto x_vec = reinterpret_cast(x_128bit); + const auto y_vec = reinterpret_cast(y_128bit); + +#pragma unroll + for (int i = 0; i < elems_per_128bit_load; i++) { + out_vec[i] = scaled_fp8_conversion( + ACT_FN(x_vec[i]) * y_vec[i], inverted_scale); + } + + out_128bit_ptr[vec_idx] = reinterpret_cast(out_vec); + } + + // Scalar cleanup code + if (block_end > vec_loop_end) { + for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end; + idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&x_ptr[idx]); + const scalar_t y = VLLM_LDG(&y_ptr[idx]); + out_ptr[idx] = + scaled_fp8_conversion(ACT_FN(x) * y, inverted_scale); + } + } +} +} // namespace vllm + +// Launch activation, gating, and quantize kernel. +#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 
1 : 2 : 4); \ + dim3 block(std::min(d, 512)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel", [&] { \ + VLLM_DISPATCH_FP8_TYPES( \ + out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] { \ + vllm::act_and_mul_quant_kernel, \ + fp8_t> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), \ + scale.data_ptr(), d); \ + }); \ + }); + +void silu_and_mul_quant(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& scale) { + TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn || + out.dtype() == torch::kFloat8_e4m3fnuz); + TORCH_CHECK(input.dtype() == torch::kFloat16 || + input.dtype() == torch::kBFloat16); + TORCH_CHECK(input.size(-1) % 2 == 0); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); +} diff --git a/vllm_v0.10.0/csrc/quantization/aqlm/gemm_kernels.cu b/vllm_v0.10.0/csrc/quantization/aqlm/gemm_kernels.cu new file mode 100644 index 0000000..79cd2c6 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/aqlm/gemm_kernels.cu @@ -0,0 +1,597 @@ +/* + * Modified by Neural Magic + * Adapted from https://github.com/Vahe1994/AQLM + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace vllm { +namespace aqlm { + +__global__ void Code1x16MatVec( + const int4* __restrict__ A, const int4* __restrict__ B, + int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, + const int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long. + const int codebook_stride // as int4. +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; + } + } + + int b_gl_rd = 0; + int c_gl_wr = a_gl_rd; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + + __shared__ int4 sh_b[32 * 9]; + float res = 0; + + int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); + while (iters--) { + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { + if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; + + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); +#pragma unroll + for (int i = 0; i < 8; i++) { + uint32_t dec[4]; + // We bypass the L1 cache to avoid massive amounts of memory streaming + // that doesn't actually help us; this brings > 2x speedup. 
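+        // ld.cg ("cache global") caches the fetched codebook entry only in L2
+        // and below, keeping these streaming reads out of L1 so they do not
+        // displace data the rest of the kernel reuses.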
+ asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*)&codebook[enc[i]])); + half2* a = reinterpret_cast(&dec); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; +#pragma unroll + for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } + a_gl_rd += 32; + } + } + + if (pred) { +#pragma unroll + for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } +} + +__global__ void Code2x8MatVec( + const int4* __restrict__ A, const int4* __restrict__ B, + int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long. + const int codebook_stride // as int4. + +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; + } + } + + int b_gl_rd = 0; + int c_gl_wr = a_gl_rd; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + int lane = threadIdx.x % 8; + + extern __shared__ int4 sh[]; + int4* sh_b = sh; + int4* sh_code = sh_b + 32 * 9; + int4* sh_code0 = sh_code; + int4* sh_code1 = sh_code + 256 * 8; + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; +#pragma unroll + for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; + } + __syncthreads(); + + float res = 0; + + int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); + while (iters--) { + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { + if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; + + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); +#pragma unroll + for (int i = 0; i < 8; i++) { + half2* a0 = + reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = + reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; +#pragma unroll + for (int j = 0; j < 4; j++) + res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } + a_gl_rd += 32; + } + } + + if (pred) { +#pragma unroll + for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } +} + +__global__ void Code1x16Dequant( + const int4* __restrict__ A, int4* __restrict__ C, + const int4* __restrict__ codebook, int prob_m, int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long, sums to m. 
+ const int codebook_stride // as int4 +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; + } + } + + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + + int c_gl_stride = prob_k / 8; + int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; + + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); +#pragma unroll + for (int i = 0; i < 8; i++) { + int4 chunk; + auto dec = reinterpret_cast(&chunk); + // We bypass the L1 cache to avoid massive amounts of memory streaming + // that doesn't actually help us; this brings > 2x speedup. + asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*)&codebook[enc[i]])); + + C[a_gl_rd * 8 + i] = chunk; + } + } + a_gl_rd += 32; + } +} + +__global__ void Code2x8Dequant( + const int4* __restrict__ A, int4* __restrict__ C, + const int4* __restrict__ codebook, int prob_m, int prob_k, + const int4 + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at + // most 3 long, corresponds to cols. + const int codebook_stride // as int4 +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. 
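+    // codebook_a_sizes is an int4 of cumulative row counts, one field per
+    // codebook partition (see accumulate_sizes below). Starting from .x, the
+    // pointer walks field by field until the cumulative size exceeds this
+    // thread's row, bumping `codebook` by codebook_stride for every partition
+    // that is skipped.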
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; + } + } + + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + int lane = threadIdx.x % 8; + + int c_gl_stride = prob_k / 8; + int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; + + extern __shared__ int4 sh[]; + int4* sh_code = sh; + int4* sh_code0 = sh_code; + int4* sh_code1 = sh_code + 256 * 8; + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; +#pragma unroll + for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; + } + __syncthreads(); + + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); +#pragma unroll + for (int i = 0; i < 8; i++) { + int4 chunk; + half2* a0 = + reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = + reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); +#pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); + C[a_gl_rd * 8 + i] = chunk; + } + } + a_gl_rd += 32; + } +} + +inline int ceildiv(int a, int b) { return (a + b - 1) / b; } + +const int THREAD_M = 16; + +void code1x16_matvec_cuda(const void* __restrict__ A, + const void* __restrict__ B, void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, + int prob_k, const int4 codebook_a_sizes, + const int codebook_stride) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16MatVec<<>>( + (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, + prob_k, codebook_a_sizes, codebook_stride); +} + +void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, + void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, + int prob_k, const int4 codebook_a_sizes, + const int codebook_stride) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); + cudaFuncSetAttribute(Code2x8MatVec, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code2x8MatVec<<>>( + (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, + prob_k, codebook_a_sizes, codebook_stride); +} + +void code1x16_dequant_cuda( + const void* __restrict__ A, void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long. + const int codebook_stride // as int4. 
+) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16Dequant<<>>( + (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at + // most 3 long. + codebook_stride // as int4. + ); +} + +// Dequantizes the code and codebook into weights. +void code2x8_dequant_cuda( + const void* __restrict__ A, void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, int prob_k, + const int4 + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at + // most 3 long, corresponds to cols. + const int codebook_stride // as int4 +) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + + cudaFuncSetAttribute(Code2x8Dequant, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared); + Code2x8Dequant<<>>( + (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, + codebook_a_sizes, codebook_stride); +} + +int codebook_stride(const torch::Tensor& codebooks) { + return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); +} + +void code1x16_matvec( + const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, + const torch::Tensor& codebook, + const int4 codebook_a_sizes // cumulative sizes of A spanning each + // codebook, at most 3 long. 
+) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + + code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), + codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, + codebook_stride(codebook)); +} + +torch::Tensor code1x16_matmat(const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const int4 codebook_a_sizes, + const std::optional& bias) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); + auto flat_output = torch::empty( + {flat_input.size(0), out_features}, + torch::TensorOptions().dtype(input.dtype()).device(input.device())); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); + code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, + codebook_a_sizes); + } + flat_output *= scales.flatten().unsqueeze(0); + + if (bias.has_value()) { + flat_output += bias->unsqueeze(0); + } + + auto output_sizes = input_sizes.vec(); + output_sizes.pop_back(); + output_sizes.push_back(-1); + auto output = flat_output.reshape(output_sizes); + return output; +} + +void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, + torch::Tensor& C, const torch::Tensor& codebook, + const int4 codebook_a_sizes) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), + codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, + 2 * codebook_stride(codebook)); +} + +torch::Tensor code2x8_matmat(const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const int4 codebook_a_sizes, + const std::optional& bias) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); + auto flat_output = torch::empty( + {flat_input.size(0), out_features}, + torch::TensorOptions().dtype(input.dtype()).device(input.device())); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); + code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, + codebook_a_sizes); + } + flat_output *= scales.flatten().unsqueeze(0); + if (bias.has_value()) { + flat_output += bias->unsqueeze(0); + } + + auto output_sizes = input_sizes.vec(); + output_sizes.pop_back(); + output_sizes.push_back(-1); + auto output = flat_output.reshape(output_sizes); + return output; +} + +// Accumulate the partition sizes. +int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { + int4 cumulative_sizes; + auto cumulative_size = &cumulative_sizes.x; + size_t i = 0; + int last = 0; + assert(codebook_partition_sizes.size() <= 4); + for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { + *cumulative_size = codebook_partition_sizes[i] + last; + last = *cumulative_size; + } + // fill in the rest with unreachable. 
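+  // Worked example: for codebook_partition_sizes = {1024, 1024} the loop
+  // above yields {1024, 2048} and `last` ends at 2048; the loop below then
+  // fills the remaining fields with 20480 (last * 10), a value no row index
+  // can reach, so those codebook slots are never selected by the kernels
+  // above.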
+ for (; i < 4; ++i, ++cumulative_size) { + *cumulative_size = last * 10; + } + return cumulative_sizes; +} + +} // namespace aqlm +} // namespace vllm + +torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::vector& codebook_partition_sizes, + const std::optional& bias) { + int4 cumulative_sizes = + vllm::aqlm::accumulate_sizes(codebook_partition_sizes); + + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); + int const entries = codebooks.size(1); + + if (nbooks == 1 && entries == (1 << 16)) { + return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, + cumulative_sizes, bias); + } + if (nbooks == 2 && entries == (1 << 8)) { + return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, + cumulative_sizes, bias); + } + + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, + " entries is not currently supported.") + return {}; +} + +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, const torch::Tensor& codebooks, + const std::vector& codebook_partition_sizes) { + int4 cumulative_sizes = + vllm::aqlm::accumulate_sizes(codebook_partition_sizes); + + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); + int const entries = codebooks.size(1); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); + int rows = codes.size(1); + int cols = codes.size(0); + + auto in_features = codes.size(1) * 8; + auto out_features = codes.size(0); + + assert(out_features == std::accumulate(codebook_partition_sizes.begin(), + codebook_partition_sizes.end(), 0)); + + auto weights = torch::empty({out_features, in_features}, + torch::TensorOptions() + .dtype(codebooks.dtype()) + .device(codebooks.device())); + + if (nbooks == 1 && entries == (1 << 16)) { + vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), + codebooks.data_ptr(), out_features, + in_features, cumulative_sizes, + vllm::aqlm::codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower + // and not consistent with gemv implementation.) 
weights *= + // scales.index({"...", 0, 0}); + + return weights; + } + + if (nbooks == 2 && entries == (1 << 8)) { + vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), + codebooks.data_ptr(), out_features, + in_features, cumulative_sizes, + vllm::aqlm::codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower + // and not consistent with gemv implementation) weights *= + // scales.index({"...", 0, 0}); + + return weights; + } + + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, + " entries is not currently supported.") + return {}; +} diff --git a/vllm_v0.10.0/csrc/quantization/awq/dequantize.cuh b/vllm_v0.10.0/csrc/quantization/awq/dequantize.cuh new file mode 100644 index 0000000..5fa4b5f --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/awq/dequantize.cuh @@ -0,0 +1,102 @@ +/* +Adapted from https://github.com/mit-han-lab/llm-awq +Modified from NVIDIA FasterTransformer: +https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +@article{lin2023awq, + title={AWQ: Activation-aware Weight Quantization for LLM Compression and +Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, +Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} +} +*/ + +#pragma once + +namespace vllm { +namespace awq { + +__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 + assert(false); +#else + uint4 result; + + uint32_t* h = reinterpret_cast(&result); + uint32_t const i4s = reinterpret_cast(source); + + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t BOTTOM_MASK = 0x000f000f; + static constexpr uint32_t TOP_MASK = 0x00f000f0; + static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; + + // Note that the entire sequence only requires 1 shift instruction. This is + // thanks to the register packing format and the fact that we force our + // integers to be unsigned, and account for this in the fp16 subtractions. In + // addition, I exploit the fact that sub and fma have the same throughput in + // order to convert elt_23 and elt_67 to fp16 without having to shift them to + // the bottom bits before hand. + + // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW + // dependency if we issue immediately before required. + const uint32_t top_i4s = i4s >> 8; + // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[0]) + : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[1]) + : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[2]) + : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[3]) + : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + + // I use inline PTX below because I am not sure if the compiler will emit + // float2half instructions if I use the half2 ctor. 
In this case, I chose + // performance reliability over code readability. + + // This is the half2 {1032, 1032} represented as an integer. + // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; + // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] + static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; + // This is the half2 {1 / 16, 1 / 16} represented as an integer. + static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; + // This is the half2 {-72, -72} represented as an integer. + // static constexpr uint32_t NEG_72 = 0xd480d480; + // Haotian: Let's use {-64, -64}. + static constexpr uint32_t NEG_64 = 0xd400d400; + + // Finally, we construct the output numbers. + // Convert elt_01 + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(h[0]) + : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_23 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(h[1]) + : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + // Convert elt_45 + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(h[2]) + : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_67 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(h[3]) + : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + + return result; +#endif + __builtin_unreachable(); // Suppress missing return statement warning +} + +} // namespace awq +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/awq/gemm_kernels.cu b/vllm_v0.10.0/csrc/quantization/awq/gemm_kernels.cu new file mode 100644 index 0000000..53c4767 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/awq/gemm_kernels.cu @@ -0,0 +1,526 @@ +/* +Adapted from https://github.com/mit-han-lab/llm-awq +@article{lin2023awq, + title={AWQ: Activation-aware Weight Quantization for LLM Compression and +Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, +Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} +} + */ + +#include +#include + +#include "dequantize.cuh" + +#include + +namespace vllm { +namespace awq { + +template +__global__ void __launch_bounds__(64) + gemm_forward_4bit_cuda_m16nXk32(int G, int split_k_iters, + half* __restrict__ A, int* __restrict__ B, + half* __restrict__ scaling_factors, + int* __restrict__ zeros, int M, int IC, + int OC, half* __restrict__ C) { + // Only support matrix n = 64 or 128 + assert(N == 64 || N == 128); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 + assert(false); +#else + static constexpr uint32_t ZERO = 0x0; + float C_warp[32]; + __shared__ half A_shared[16 * (32 + 8)]; + __shared__ half B_shared[32 * (N + 8)]; + + int j_factors1 = ((OC + N - 1) / N); + int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1); + int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1); + + half A_shared_warp[8]; + half B_shared_warp[N / 4]; + for (int j_0_4_init = 0; j_0_4_init < N / 32; ++j_0_4_init) { + for (int i = 0; i < 8; ++i) { + C_warp[(j_0_4_init * 8) + i] = 0.0; + } + } + + static constexpr int row_stride_warp = 32 * 8 / 32; + static constexpr int row_stride = 2 * 32 * 8 / N; + // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 + bool ld_A_flag = + (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id + // bool wb_C_flag = (threadIdx.x / 4) < M; + + half* A_ptr = + A + + (((int)blockIdx_y) / j_factors1 * 16 + + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * + IC + + (((int)threadIdx.x) % (32 / 8)) * 8; + + int* B_ptr = B + ((int)threadIdx.y) * (OC / 8) * 
(256 / N) + + (((int)threadIdx.x) / (N / 8)) * (OC / 8) + + (((int)blockIdx_y) % j_factors1) * (N / 8) + + (((int)threadIdx.x) % (N / 8)) * 1; + // Why * 1 in the above line? + + half* A_shared_ptr = A_shared + + ((int)threadIdx.y) * row_stride_warp * (32 + 8) + + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) + + (((int)threadIdx.x) % (32 / 8)) * 8; + + half* B_shared_ptr = B_shared + + ((int)threadIdx.y) * (row_stride / 2) * (N + 8) + + (((int)threadIdx.x) / (N / 8)) * (N + 8) + + (((int)threadIdx.x) % (N / 8)) * 8; + + int* zeros_ptr = zeros + (((int)blockIdx_y) % j_factors1) * (N / 8) + + ((int)threadIdx.x) % (N / 8); + + half* scaling_factors_ptr = scaling_factors + + (((int)blockIdx_y) % j_factors1) * N + + (((int)threadIdx.x) % (N / 8)) * 8; + + half* C_ptr = + C + + static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim + + (((int)blockIdx_y) % j_factors1) * N + ((int)threadIdx.y) * (N / 2) + + (((int)threadIdx.x) % 4) * 2; + + // preload s.f. and zeros + int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters; + if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1; + for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) { + int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z; + __syncthreads(); + // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 + if (ld_A_flag) { + *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32)); + } else { + *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0); + } + + // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) { + uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8)); + uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); + uint4 B_loaded_scale = + *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); + /* + if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && + threadIdx.y == 0){ printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, + B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, + B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); + } + */ + // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); + int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); + + for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < N / 16; ++ax0_ax1_fused_0) { + // B: 32 x 136 (128+8) float16 + // each warp: 32 x 4 + // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus + // zero -> WB UINT4 + // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * + // 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) + // * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * + // 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * + // 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * + // 8))); row stride in shared memory: (NWARPS * 32 * 8 / cta_N) + uint32_t B_loaded = + *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); + uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); + + // - zero and * scale + // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = + // q * scale - zero * scale. 
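+      // Dequantization as written below: w = (q - zero) * scale, issued per
+      // half2 as a sub.f16x2 followed by an fma.rn.f16x2 with a zero addend
+      // (ZERO). As the TODO above notes, precomputing zero * scale would fold
+      // this into a single fma per half2: w = q * scale - zero * scale.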
+ asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + /* + if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == + 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", + B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); + } + */ + + // write back + *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) = + B_loaded_fp16; + } + __syncthreads(); + + for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) { + { + unsigned int addr; + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, " + "addr; }\n" + : "=r"(addr) + : "l"((void*)((&(A_shared[(k_0_1 * 16)])) + + (((((int)threadIdx.x) & 15) * 40) + + ((((int)threadIdx.x) >> 4) * 8))))); + + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned*)(A_shared_warp + 0))[0]), + "=r"(((unsigned*)(A_shared_warp + 0))[1]), + "=r"(((unsigned*)(A_shared_warp + 0))[2]), + "=r"(((unsigned*)(A_shared_warp + 0))[3]) + : "r"(addr)); + } + + for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) { + { + unsigned int addr; + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, " + "addr; }\n" + : "=r"(addr) + : "l"((void*)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + + (((int)threadIdx.y) * (N / 2))) + + (ax1_0 * 16))])) + + (((((int)threadIdx.x) & 15) * (N + 8)) + + ((((int)threadIdx.x) >> 4) * 8))))); + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[0]), + "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[1]), + "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[2]), + "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[3]) + : "r"(addr)); + } + } + for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); + } + + { + __asm__ __volatile__( + 
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); + } + #else + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, " + "%13};\n" + : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, " + "%13};\n" + : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + 
"f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); + } + + #endif + } + } + } + + // TODO: Shang: Hoist loop invariance. + for (int ax1_0_1 = 0; ax1_0_1 < (N / 32); ++ax1_0_1) { + for (int local_id = 0; local_id < 8; ++local_id) { + int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; + if (row_offset < M) { + *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); + } + } + } +#endif +} + +__global__ void __launch_bounds__(64) + dequantize_weights(int* __restrict__ B, half* __restrict__ scaling_factors, + int* __restrict__ zeros, half* __restrict__ C, int G) { + static constexpr uint32_t ZERO = 0x0; + half B_shared[32 * (128 + 8)]; + + half* B_shared_ptr2 = B_shared; + + int N = blockDim.x * gridDim.x; // 2 + int col = (blockIdx.x * blockDim.x + threadIdx.x); + int row = blockIdx.y * blockDim.y + threadIdx.y; + int index1 = 8 * col + 8 * row * N; + half* C_ptr2 = C + index1; + + int index2 = col + row * N; + int* B_ptr2 = B + index2; + + int index3 = col + (int)(row / G) * N; + int* zeros_ptr2 = zeros + index3; + int index4 = 8 * col + (int)(row / G) * N * 8; + half* scaling_factors_ptr2 = scaling_factors + index4; + + uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2); + uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); + uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr2); + + uint32_t B_loaded = *(uint32_t*)B_ptr2; + uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + + *(uint4*)B_shared_ptr2 = B_loaded_fp16; + + for (int i = 0; i < 8; ++i) { + *(C_ptr2 + i) = B_shared[i]; + } +} + +} // namespace awq +} // namespace vllm + +torch::Tensor awq_dequantize(torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, int64_t split_k_iters, + int64_t thx, int64_t thy) { + int in_c = _kernel.size(0); + int qout_c = _kernel.size(1); + int out_c = qout_c * 8; + int G = in_c / _scaling_factors.size(0); + + int x_thread = thx; + int y_thread = thy; + + int x_blocks = 1; + int y_blocks = 1; + if (thx == 0) { + x_thread = qout_c; + } + if (thy == 0) { + y_thread = in_c; + } + if (thx == 0 && thy == 0) { + x_thread = 8; + y_thread = 8; + x_blocks = (int)(qout_c / 8); + y_blocks = (int)(in_c / 8); + } + + const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); + + auto options = torch::TensorOptions() + 
.dtype(_scaling_factors.dtype()) + .device(_scaling_factors.device()); + at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); + + auto kernel = reinterpret_cast(_kernel.data_ptr()); + auto de_kernel = reinterpret_cast(_de_kernel.data_ptr()); + auto scaling_factors = + reinterpret_cast(_scaling_factors.data_ptr()); + auto zeros = reinterpret_cast(_zeros.data_ptr()); + + dim3 num_blocks(x_blocks, y_blocks); + dim3 threads_per_block(x_thread, y_thread); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + vllm::awq::dequantize_weights<<>>( + kernel, scaling_factors, zeros, de_kernel, G); + + return _de_kernel; +} + +// in_feats: M, IC [float16] +// kernel: IC, OC // 8 [int32] -> cast to IC, OC [uint4b] +// scaling_factors: IC // G, OC [float16] +// zeros: IC // G, OC // 8 [int32] -> cast to IC // G, OC [uint4b] +// assume that batch_size < 16 for now + +torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, + torch::Tensor _scaling_factors, torch::Tensor _zeros, + int64_t split_k_iters) { + int num_in_feats = _in_feats.size(0); + int num_in_channels = _in_feats.size(1); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); + + auto options = torch::TensorOptions() + .dtype(_in_feats.dtype()) + .device(_in_feats.device()); + at::Tensor _out_feats = + torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options); + int num_out_feats = _out_feats.size(-2); + int num_out_channels = _out_feats.size(-1); + + auto in_feats = reinterpret_cast(_in_feats.data_ptr()); + auto kernel = reinterpret_cast(_kernel.data_ptr()); + auto out_feats = reinterpret_cast(_out_feats.data_ptr()); + auto scaling_factors = + reinterpret_cast(_scaling_factors.data_ptr()); + auto zeros = reinterpret_cast(_zeros.data_ptr()); + int group_size = num_in_channels / _scaling_factors.size(0); + + if (num_out_channels % 64 != 0) + throw std::invalid_argument("OC is not multiple of cta_N = 64"); + if (num_out_channels % 8 != 0) + throw std::invalid_argument("OC is not multiple of pack_num = 8"); + if (group_size % 32 != 0) + throw std::invalid_argument("Group size should be a multiple of 32"); + if (num_out_channels % group_size != 0) + throw std::invalid_argument("OC is not multiple of Group size"); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (num_out_channels % 128 == 0) { + int j_factors1 = num_out_channels / 128 / 1; + dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); + // threadIdx.x: 32 + // threadIdx.y: i_factors[2] * j_factors[2] + dim3 threads_per_block(32, 2); + vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128> + <<>>( + group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, + num_in_feats, num_in_channels, num_out_channels, out_feats); + } else if (num_out_channels % 64 == 0) { + int j_factors1 = num_out_channels / 64 / 1; + dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * + split_k_iters); + + // threadIdx.x: 32 + // threadIdx.y: i_factors[2] * j_factors[2] + dim3 threads_per_block(32, 2); + vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64> + <<>>( + group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, + num_in_feats, num_in_channels, num_out_channels, out_feats); + } + return _out_feats.sum(0); +} diff --git a/vllm_v0.10.0/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/vllm_v0.10.0/csrc/quantization/compressed_tensors/int8_quant_kernels.cu new file mode 100644 index 0000000..5cd2ac1 --- /dev/null +++ 
b/vllm_v0.10.0/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -0,0 +1,338 @@ +#include +#include + +#include + +#include "../../dispatch_utils.h" +#include "../vectorization_utils.cuh" + +#ifndef USE_ROCM + #include + #include +#else + #include + #include +#endif + +static inline __device__ int8_t float_to_int8_rn(float x) { +#ifdef USE_ROCM + static constexpr auto i8_min = + static_cast(std::numeric_limits::min()); + static constexpr auto i8_max = + static_cast(std::numeric_limits::max()); + + // To match the rounding mode of CUDA, we use nearbyint. + // It uses the current rounding mode, which is always FE_TONEAREST on HIP. + // If that changes in the future, we may need to set the rounding mode + // explicitly, either at runtime or compile time. + float dst = std::nearbyint(x); + + // saturate + + // See https://github.com/pytorch/pytorch/issues/127666 + // See https://github.com/llvm/llvm-project/issues/95183 + // hip-clang std::clamp __glibcxx_assert_fail host function when building on + // Arch/gcc14. The following replaces std::clamp usage with similar logic + // dst = std::clamp(dst, i8_min, i8_max); + dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst; + return static_cast(dst); +#else + // CUDA path + uint32_t dst; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); + return reinterpret_cast(dst); +#endif +} + +static inline __device__ int32_t float_to_int32_rn(float x) { +#ifdef USE_ROCM + // int32_max is not exactly representable as float. + // Therefore, we need to be careful and manually return int32_max on overflow. + // For symmetry, we also do the same for int32_min, even though it is exactly + // representable as float and the conversion should be exact. + static constexpr auto i32_min = std::numeric_limits::min(); + static constexpr auto i32_min_f = static_cast(i32_min); + static constexpr auto i32_max = std::numeric_limits::max(); + static constexpr auto i32_max_f = static_cast(i32_max); + + // To match the rounding mode of CUDA, we use nearbyint. + // It uses the current rounding mode, which is always FE_TONEAREST on HIP. + // If that changes in the future, we may need to set the rounding mode + // explicitly, either at runtime or compile time. + float dst = std::nearbyint(x); + + // saturate on the higher end. + if (dst >= i32_max_f) { + return i32_max; + } + // saturate on the lower end. + if (dst <= i32_min_f) { + return i32_min; + } + + return static_cast(dst); +#else + // CUDA path + uint32_t dst; + asm volatile("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(dst) : "f"(x)); + return reinterpret_cast(dst); +#endif +} + +static inline __device__ int8_t int32_to_int8(int32_t x) { +#ifdef USE_ROCM + static constexpr auto i8_min = + static_cast(std::numeric_limits::min()); + static constexpr auto i8_max = + static_cast(std::numeric_limits::max()); + + // saturate + + // See https://github.com/pytorch/pytorch/issues/127666 + // See https://github.com/llvm/llvm-project/issues/95183 + // hip-clang std::clamp __glibcxx_assert_fail host function when building on + // Arch/gcc14. The following replaces std::clamp usage with similar logic + // int32_t dst = std::clamp(x, i8_min, i8_max); + int32_t dst = (x < i8_min) ? i8_min : (x > i8_max) ? 
i8_max : x; + return static_cast(dst); +#else + // CUDA path + uint32_t dst; + asm volatile("cvt.sat.s8.s32 %0, %1;" : "=r"(dst) : "r"(x)); + return reinterpret_cast(dst); +#endif +} + +namespace vllm { + +template +__global__ void static_scaled_int8_quant_kernel( + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + const scale_t* scale_ptr, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + const float scale = *scale_ptr; + + // Must be performed using 64-bit math to avoid integer overflow. + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; + + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + dst = float_to_int8_rn(static_cast(src) / scale); + }); +} + +template +__global__ void static_scaled_int8_azp_quant_kernel( + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + const float scale = *scale_ptr; + const azp_t azp = *azp_ptr; + const float inv_s = 1.0f / scale; + + // Must be performed using 64-bit math to avoid integer overflow. + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; + + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + const auto v = static_cast(src) * inv_s; + dst = int32_to_int8(float_to_int32_rn(v) + azp); + }); +} + +template +__global__ void dynamic_scaled_int8_quant_kernel( + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + scale_t* scale_out, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + + // Must be performed using 64-bit math to avoid integer overflow. + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; + + // calculate for absmax + float thread_max = 0.f; + vectorize_read_with_alignment<16>( + row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) { + const float v = fabsf(static_cast(src)); + thread_max = fmaxf(thread_max, v); + }); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; + float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); + __shared__ float absmax; + if (tid == 0) { + absmax = block_max; + scale_out[blockIdx.x] = absmax / 127.f; + } + __syncthreads(); + + float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; + + // 2. 
quantize + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + dst = float_to_int8_rn(static_cast(src) * inv_s); + }); +} + +// MinMax structure to hold min and max values in one go +struct MinMax { + float min, max; + + __host__ __device__ MinMax() + : min(std::numeric_limits::max()), + max(std::numeric_limits::lowest()) {} + + __host__ __device__ explicit MinMax(float v) : min(v), max(v) {} + + // add a value to the MinMax + __host__ __device__ MinMax& operator+=(float v) { + min = fminf(min, v); + max = fmaxf(max, v); + return *this; + } + + // merge two MinMax objects + __host__ __device__ MinMax& operator&=(const MinMax& other) { + min = fminf(min, other.min); + max = fmaxf(max, other.max); + return *this; + } +}; + +__host__ __device__ inline MinMax operator+(MinMax a, float v) { + return a += v; +} +__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) { + return a &= b; +} + +template +__global__ void dynamic_scaled_int8_azp_quant_kernel( + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + scale_t* scale_out, azp_t* azp_out, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + + // Must be performed using 64-bit math to avoid integer overflow. + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; + + // 1. calculate min & max + MinMax thread_mm; + vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride, + [&] __device__(const scalar_t& src) { + thread_mm += static_cast(src); + }); + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; + + MinMax mm = BlockReduce(tmp).Reduce( + thread_mm, + [] __device__(MinMax a, const MinMax& b) { + a &= b; + return a; + }, + blockDim.x); + + __shared__ float scale_sh; + __shared__ azp_t azp_sh; + if (tid == 0) { + float s = (mm.max - mm.min) / 255.f; + float zp = nearbyintf(-128.f - mm.min / s); // round-to-even + scale_sh = s; + azp_sh = azp_t(zp); + scale_out[blockIdx.x] = s; + azp_out[blockIdx.x] = azp_sh; + } + __syncthreads(); + + const float inv_s = 1.f / scale_sh; + const azp_t azp = azp_sh; + + // 2. 
quantize + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + const auto v = static_cast(src) * inv_s; + dst = int32_to_int8(float_to_int32_rn(v) + azp); + }); +} + +} // namespace vllm + +void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& scale, + std::optional const& azp) { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp || azp->numel() == 1); + + int const hidden_size = input.size(-1); + int const num_tokens = input.numel() / hidden_size; + dim3 const grid(num_tokens); + dim3 const block(std::min(hidden_size, 256)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { + if (!azp) { + vllm::static_scaled_int8_quant_kernel + <<>>( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), hidden_size); + } else { + vllm::static_scaled_int8_azp_quant_kernel + <<>>( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), + hidden_size); + } + }); +} + +void dynamic_scaled_int8_quant( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor& scales, std::optional const& azp) { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(scales.is_contiguous()); + TORCH_CHECK(!azp || azp->is_contiguous()); + + int const hidden_size = input.size(-1); + int const num_tokens = input.numel() / hidden_size; + dim3 const grid(num_tokens); + dim3 const block(std::min(hidden_size, 256)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { + if (!azp) { + vllm::dynamic_scaled_int8_quant_kernel + <<>>( + input.data_ptr(), out.data_ptr(), + scales.data_ptr(), hidden_size); + } else { + vllm::dynamic_scaled_int8_azp_quant_kernel + <<>>( + input.data_ptr(), out.data_ptr(), + scales.data_ptr(), azp->data_ptr(), + hidden_size); + } + }); +} diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/Epilogues.md b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/Epilogues.md new file mode 100644 index 0000000..a30e1fd --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/Epilogues.md @@ -0,0 +1,167 @@ +# CUTLASS Epilogues + +## Introduction + +This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. + +Currently, we only support symmetric quantization for weights, +and symmetric and asymmetric quantization for activations. +Both can be quantized per-tensor or per-channel (weights) / per-token (activations). + +There are 4 epilogues: + +1. `ScaledEpilogue`: symmetric quantization for activations, no bias. +1. `ScaledEpilogueBias`: symmetric quantization for activations, supports bias. +1. `ScaledEpilogueAzp`: asymmetric per-tensor quantization for activations, supports bias. +1. `ScaledEpilogueAzpPerToken`: asymmetric per-token quantization for activations, supports bias. + +We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size. +Instead, if no bias is passed, the epilogue will use 0 as the bias. +That induces a redundant addition operation (and runtime check), but the performance impact is minor. 
+ +## Underlying Linear Algebra + +More details available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975). + +If $` \widehat X `$ is the quantized $` X `$, our matrices become the following + +```math +A = s_a (\widehat A - J_a z_a) +``` + +```math +B = s_b \widehat B +``` + +```math +D = A B + C +``` + +```math +D = s_a s_b \widehat D + C +``` + +Here, D is the output of the GEMM, and C is the bias. +A is the activations and supports asymmetric quantization, +and B is the weights and only supports symmetric quantization. +$ s_a $ and $s_b$ are the scales for activations and weights, respectively. +$ z_a $ is the zero-point for activations, and $ J_a $ is the matrix of all ones with dimensions of A. +Additional epilogues would be required to support asymmetric quantization for weights. + +Expanding further, we can calculate $` \widehat D `$ as follows: + +```math +A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B +``` + +```math +A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right) +``` + +```math +\widehat D = \widehat A \widehat B - z_a J_a \widehat B +``` + +Note that $` \widehat A \widehat B `$ is the raw output of the GEMM, +and $` J_a \widehat B `$ is known ahead of time. +Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$. + +## Epilogues + +### `ScaledEpilogue` + +This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$. +The output of the GEMM is: + +```math +\widehat D = \widehat A \widehat B +``` + +```math +D = s_a s_b \widehat D +``` + +```math +D = s_a s_b \widehat A \widehat B +``` + +Epilogue parameters: +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). + +### `ScaledEpilogueBias` + +This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$. +The output of the GEMM is: + +```math +\widehat D = \widehat A \widehat B +``` + +```math +D = s_a s_b \widehat D + C +``` + +```math +D = s_a s_b \widehat A \widehat B + C +``` + +Epilogue parameters: + +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). +- `bias` is the bias, is always per-channel (row-vector). + +### `ScaledEpilogueAzp` + +This epilogue computes the asymmetric per-tensor quantization for activations with bias. +The output of the GEMM is: + +```math +\widehat D = \widehat A \widehat B - z_a J_a \widehat B +``` + +```math +D = s_a s_b \widehat D + C +``` + +```math +D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C +``` + +Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. +That is precomputed and stored in `azp_with_adj` as a row-vector. + +Epilogue parameters: + +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). + - Generally this will be per-tensor as the zero-points are per-tensor. +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). +- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector). +- `bias` is the bias, is always per-channel (row-vector). 
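+
+Because $` z_a J_a \widehat B `$ reduces to $` z_a `$ times the column sums of $` \widehat B `$, the `azp_with_adj` input can be computed ahead of time. Below is a minimal sketch of that offline precomputation, assuming a hypothetical helper name and the PyTorch C++ API (it is illustrative only, not the actual vLLM helper):
+
+```cpp
+#include <torch/all.h>
+
+// Hypothetical helper, not part of vLLM: b_q is the quantized weight
+// [K, N] (int8) and z_a the per-tensor activation zero-point. The result
+// is the row-vector z_a * (1^T B_hat) passed to the kernel as azp_with_adj.
+torch::Tensor make_azp_with_adj(const torch::Tensor& b_q, int64_t z_a) {
+  // Column sums accumulated in int32 so the int8 entries cannot overflow,
+  // then scaled by the scalar zero-point.
+  return b_q.to(torch::kInt32).sum(0) * z_a;
+}
+```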
+ +To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel. + +### `ScaledEpilogueAzpPerToken` + +This epilogue computes the asymmetric per-token quantization for activations with bias. + +The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector. +That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$. + +Epilogue parameters: + +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). + - Generally this will be per-token as the zero-points are per-token. +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). +- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector). +- `azp` is the zero-point (`z_a`), is per-token (column-vector). +- `bias` is the bias, is always per-channel (row-vector). + +To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel. + +The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM): + +```math +out = scale_a * scale_b * (Dq - azp_adj * azp) + bias +``` diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh new file mode 100644 index 0000000..26de32c --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh @@ -0,0 +1,107 @@ +#pragma once + +// clang-format will break include orders +// clang-format off +#include + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/util/packed_stride.hpp" + +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" +// clang-format on + +namespace vllm::c3x { + +static inline cute::Shape get_problem_shape( + torch::Tensor const& a, torch::Tensor const& b) { + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + return {m, n, k, 1}; +} + +template +void cutlass_gemm_caller( + torch::Device device, cute::Shape prob_shape, + typename GemmKernel::MainloopArguments mainloop_args, + typename GemmKernel::EpilogueArguments epilogue_args, + typename GemmKernel::TileSchedulerArguments scheduler = {}) { + cutlass::KernelHardwareInfo hw_info; + typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, + prob_shape, + mainloop_args, + epilogue_args, + hw_info, + scheduler}; + + // Launch the CUTLASS GEMM kernel. + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(device); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(device.index()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementC = typename Gemm::ElementC; + using ElementD = typename Gemm::ElementD; + using GemmKernel = typename Gemm::GemmKernel; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = StrideC; + using StrideAux = StrideC; + + typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b); + auto [M, N, K, L] = prob_shape; + + StrideA a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + StrideB b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + StrideC c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + StrideD d_stride = + cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + StrideAux aux_stride = d_stride; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + // auto d_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, d_stride}; + + cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +} // namespace vllm::c3x \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh new file mode 100644 index 0000000..2d67da9 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -0,0 +1,209 @@ +#pragma once + +// clang-format will break include orders +// clang-format off + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" +// clang-format on + +/* + Epilogues defined in, + csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp, + must contain a public type named EVTCompute of type Sm90EVT, as well as a + static prepare_args function that constructs an EVTCompute::Arguments struct. 
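+  (cutlass_gemm_caller, defined in cutlass_gemm_caller.cuh, forwards its
+  epilogue_params pack directly into prepare_args when building the
+  EpilogueArguments, so the pack must match that function's signature.)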
+*/ + +using namespace cute; + +namespace vllm { + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using Epilogue = Epilogue_; + + using StrideD = Stride, Int<0>>; + using ElementC = void; + using StrideC = StrideD; + + using EVTCompute = typename Epilogue::EVTCompute; + + // These are the minimum alignments needed for the kernels to compile + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentCD = + 128 / cutlass::sizeof_bits::value; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, float, ElementC, StrideC, AlignmentCD, ElementD, StrideD, + AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + // clang-format off + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementAB, cutlass::layout::RowMajor, AlignmentAB, + ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, + ElementAcc, TileShape, ClusterShape, + Stages, + KernelSchedule>::CollectiveOp; + // clang-format on + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + cutlass::gemm::PersistentScheduler>>; + + struct GemmKernel : public KernelType {}; +}; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm_sm100 { + using ElementAB = ElementAB_; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = + 128 / cutlass::sizeof_bits::value; + + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = + 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = + 128 / cutlass::sizeof_bits::value; + + using ElementD = ElementD_; + using LayoutD = cutlass::layout::RowMajor; + static constexpr int AlignmentD = AlignmentC; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + using Epilogue = Epilogue_; + + // MMA type + using ElementAccumulator = float; + + // Epilogue types + using ElementBias = cutlass::half_t; + using ElementCompute = float; + using ElementAux = ElementD; + using LayoutAux = LayoutD; + using ElementAmax = float; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB, + 
ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; +}; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm_sm120 { + using ElementAB = ElementAB_; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = + 128 / cutlass::sizeof_bits::value; + + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = + 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = + 128 / cutlass::sizeof_bits::value; + + using ElementD = ElementD_; + using LayoutD = cutlass::layout::RowMajor; + static constexpr int AlignmentD = AlignmentC; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + using Epilogue = Epilogue_; + + // MMA type + using ElementAccumulator = float; + + // Epilogue types + using ElementBias = cutlass::half_t; + using ElementCompute = float; + using ElementAux = ElementD; + using LayoutAux = LayoutD; + using ElementAmax = float; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB, + ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; +}; + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu new file mode 100644 index 0000000..4cd38f4 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm90_int8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias) { + if (azp) { + return cutlass_scaled_mm_sm90_int8_epilogue< + c3x::ScaledEpilogueBiasAzpToken>(out, a, b, a_scales, b_scales, azp_adj, + *azp, bias); + } else { + return cutlass_scaled_mm_sm90_int8_epilogue( + out, a, b, a_scales, b_scales, azp_adj, bias); + } +} + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu new file 
mode 100644 index 0000000..4a8a5ed --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu @@ -0,0 +1,23 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm100_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm100_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh new file mode 100644 index 0000000..c841125 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -0,0 +1,279 @@ +#pragma once + +#include "cuda_utils.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +// clang-format off +template +struct cutlass_3x_gemm_fp8_blockwise { + static constexpr bool swap_ab = swap_ab_; + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; // TODO: support bias + using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementCompute = float; + using ElementBlockScale = float; + + using ScaleConfig = conditional_t, + cutlass::detail::Sm100BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>>; + + // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
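+  // ScaleConfig encodes the blockwise scale granularity. When swap_ab is
+  // set, the M/N granularities are exchanged in the conditional above
+  // instead of swapping the deduced layouts; the A/B operands and their
+  // scale pointers are then exchanged when the mainloop arguments are
+  // built in cutlass_gemm_caller_blockwise.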
+ using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); + + using ArchTag = cutlass::arch::Sm100; + using OperatorClass = cutlass::arch::OpClassTensorOp; + + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + using ElementScalar = float; + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + MmaTileShape, + ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, + ElementCompute, + ElementC, + conditional_t, + AlignmentC, + ElementD, + conditional_t, + AlignmentD, + EpilogueScheduler, + DefaultOperation + >::CollectiveOp; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; + using CollectiveMainloop = conditional_t, + AlignmentB, + ElementA, + cute::tuple, + AlignmentA, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp, + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp>; + + using KernelType = enable_sm100_only, CollectiveMainloop, CollectiveEpilogue>>; + + struct GemmKernel : public KernelType {}; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + static constexpr bool swap_ab = Gemm::swap_ab; + using GemmKernel = typename Gemm::GemmKernel; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideD = typename Gemm::GemmKernel::StrideD; + using StrideC = typename Gemm::GemmKernel::StrideC; + using LayoutSFA = typename Gemm::LayoutSFA; + using LayoutSFB = typename Gemm::LayoutSFB; + using ScaleConfig = typename Gemm::ScaleConfig; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + + StrideA a_stride; + StrideB b_stride; + StrideC c_stride; + a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + c_stride = + cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1)); + + LayoutSFA layout_SFA = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) : + ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); + LayoutSFB layout_SFB = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : + ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + auto mainloop_args = [&](){ + // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
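+    // When swap_ab is set (small or non-multiple-of-4 M, chosen by the
+    // dispatch below), B takes the role of operand A and A the role of
+    // operand B, with the scale pointers exchanged to match; the problem
+    // shape is likewise built as (n, m, k, 1) further down.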
+ if (swap_ab) { + return typename GemmKernel::MainloopArguments{ + b_ptr, b_stride, a_ptr, a_stride, + b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB + }; + } + else { + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + } + }(); + auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +template +void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); + + constexpr int TILE_K = 128; + // TODO: better heuristics + bool swap_ab = (m < 16) || (m % 4 != 0); + bool use_tma_epilogue = (m * n) % 4 == 0; + if (!swap_ab) { + constexpr int TILE_N = 128; + int tile_m = 256; + if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) { + tile_m = 64; + } + else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) { + tile_m = 128; + } + if (tile_m == 64) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else if (tile_m == 128) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else { // tile_m == 256 + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } + } + } else { + // TODO: Test more tile N configs + constexpr int TILE_M = 128; + constexpr int TILE_N = 16; + // TMA epilogue isn't compatible with Swap A/B + cutlass_gemm_caller_blockwise, Int, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu new file mode 100644 index 0000000..0501e6d --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu @@ -0,0 +1,24 @@ + +#include 
"scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh new file mode 100644 index 0000000..e089c3d --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -0,0 +1,194 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +template > +struct cutlass_3x_gemm_fp8_blockwise { + using GroupSizeM = Int; + using GroupSizeN = Int; + using GroupSizeK = Int; + using TileSizeM = Int; + + static_assert(TileSizeM_ % GroupSizeM_ == 0, + "TileSizeM must be a multiple of GroupSizeM"); + + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using StrideD = Stride, Int<0>>; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using StrideC = StrideD; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementBlockScale = float; + using ElementCompute = float; + using ArchTag = cutlass::arch::Sm90; + using OperatorClass = cutlass::arch::OpClassTensorOp; + using TileShape = Shape; + + using KernelSchedule = cutlass::gemm:: + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + GroupSizeM_>; + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; + using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + + using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< + cutlass::epilogue::fusion::Sm90AccFetch>; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, + ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, + ElementD, StrideD, AlignmentD, EpilogueSchedule, + StoreEpilogueCompute>::CollectiveOp; 
+ + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, + LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + SchedulerType>>; + + struct GemmKernel : public KernelType {}; + + using StrideA = typename GemmKernel::StrideA; + using StrideB = typename GemmKernel::StrideB; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using GemmKernel = typename Gemm::GemmKernel; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + auto prob_shape = c3x::get_problem_shape(a, b); + int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), + k = get<2>(prob_shape); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + // Check is the t is contiguous and is 1D or 2D with one of the dimensions + // being 1 (i.e. a row or column vector) + auto is_contiguous_vector = [](const torch::Tensor& t) { + auto t_sizes = t.sizes(); + return t.is_contiguous() && + (t.dim() == 1 || + (t.dim() == 2 && + *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); + }; + + // TODO(lucas): lets clean-up the kernel so that we pass in Strides so + // we don't have to deal with enforcing implicit layouts + TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); + TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), + "a_scales must be M major"); + TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); + TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), + "b_scales must be K major"); + typename GemmKernel::MainloopArguments mainloop_args{ + a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::TileSchedulerArguments scheduler; + + static constexpr bool UsesStreamKScheduler = + cute::is_same_v; + + if constexpr (UsesStreamKScheduler) { + using DecompositionMode = typename cutlass::gemm::kernel::detail:: + PersistentTileSchedulerSm90StreamKParams::DecompositionMode; + using ReductionMode = typename cutlass::gemm::kernel::detail:: + PersistentTileSchedulerSm90StreamKParams::ReductionMode; + + scheduler.decomposition_mode = DecompositionMode::StreamK; + scheduler.reduction_mode = ReductionMode::Nondeterministic; + } + + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args, scheduler); +} + +template +void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, + 
torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto k = a.size(1); + auto n = b.size(1); + + if (k > 3 * n) { + cutlass_gemm_caller_blockwise>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise>( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp new file mode 100644 index 0000000..2ee6a19 --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp @@ -0,0 +1,75 @@ +#include +#include "cuda_utils.h" +#include "cutlass_extensions/common.hpp" + +template +void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias, + Fp8Func fp8_func, Int8Func int8_func, + BlockwiseFunc blockwise_func) { + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + + int M = a.size(0), N = b.size(1), K = a.size(1); + + if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) && + (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) { + // Standard per-tensor/per-token/per-channel scaling + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (a.dtype() == torch::kFloat8_e4m3fn) { + fp8_func(c, a, b, a_scales, b_scales, bias); + } else { + TORCH_CHECK(a.dtype() == torch::kInt8); + if constexpr (!std::is_same_v) { + int8_func(c, a, b, a_scales, b_scales, bias); + } else { + TORCH_CHECK(false, "Int8 not supported for this architecture"); + } + } + } else { + TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor."); + TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor."); + int32_t version_num = get_sm_version_num(); + if (version_num >= 100) { + TORCH_CHECK( + a.size(0) == a_scales.size(0) && + cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1), + "a_scale_group_shape must be [1, 128]."); + TORCH_CHECK( + cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) && + cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1), + "b_scale_group_shape must be [128, 128]."); + } else { + // TODO: Remove this after using cutlass sm90 blockwise scaling gemm + // kernel, or introducing ceil_div to the load_init() of mainloop. + using GroupShape = std::array; + auto make_group_shape = [](torch::Tensor const& x, + torch::Tensor const& s) -> GroupShape { + TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D"); + return {cuda_utils::ceil_div(x.size(0), s.size(0)), + cuda_utils::ceil_div(x.size(1), s.size(1))}; + }; + + GroupShape a_scale_group_shape = make_group_shape(a, a_scales); + GroupShape b_scale_group_shape = make_group_shape(b, b_scales); + + // 1x128 per-token group scales for activations + // 128x128 blockwise scales for weights + TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} && + b_scale_group_shape == GroupShape{128, 128} && + a.dtype() == torch::kFloat8_e4m3fn && + b.dtype() == torch::kFloat8_e4m3fn), + "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n" + "a_scale_group_shape must be [1, 128]. Got: [", + a_scale_group_shape[0], ", ", a_scale_group_shape[1], + "]\n" + "b_scale_group_shape must be [128, 128]. 
Got: [", + b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]"); + } + + TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); + blockwise_func(c, a, b, a_scales, b_scales); + } +} diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp new file mode 100644 index 0000000..e049a5f --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include + +namespace vllm { + +void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias); + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); + +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_sm120_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu new file mode 100644 index 0000000..cf2cccc --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm100_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias) { + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh new file mode 100644 index 0000000..24564ef --- /dev/null +++ b/vllm_v0.10.0/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -0,0 +1,136 @@ +#pragma once + +#include "scaled_mm.cuh" +#include "cutlass_gemm_caller.cuh" + +/** + * This file defines Gemm kernel 
configurations for SM100 (fp8) based on the + * Gemm shape. + */ + +namespace vllm { + +using c3x::cutlass_gemm_caller; + +template typename Epilogue> +struct sm100_fp8_config_default { + // M in (256, inf) + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_256, _128, _128>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M256 { + // M in (64, 256] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M64 { + // M in (16, 64] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue, + typename... EpilogueArgs> +inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm100_fp8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM16 = + typename sm100_fp8_config_M16::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm100_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM256 = + typename sm100_fp8_config_M256::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(16), next_pow_2(m)); // next power of 2 + + if (mp2 <= 16) { + // m in [1, 16] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 64) { + // m in (16, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 256) { + // m in (64, 256] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + // m in (256, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + +template